├── README.md
├── build.sbt
├── project
│   └── plugins.sbt
├── src
│   └── main
│       ├── resources
│       │   ├── application.conf
│       │   ├── items.txt
│       │   └── ratings.txt
│       └── scala
│           ├── Test.scala
│           ├── colfilt2
│           │   └── CollaborativeFiltering.scala
│           ├── import1
│           │   └── Import.scala
│           ├── model
│           │   └── Model.scala
│           └── utils
│               ├── Converter.scala
│               ├── CustomObjectInputStream.scala
│               ├── EsClient.scala
│               └── Settings.scala
└── workshop-spark.pdf

/README.md:
--------------------------------------------------------------------------------
spark-elasticsearch-mllib
=========================

ScalaIO 2014 Workshop
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
scalacOptions in (Compile, console) += "-Yrepl-sync"

organization := "com.ebiznext.scalaio"

name := "import"

version := "0.1.0-SNAPSHOT"

scalaVersion := "2.10.4"

resolvers += "Typesafe Releases" at "http://repo.typesafe.com/typesafe/releases/"

resolvers += "Conjars" at "http://conjars.org/repo"

resolvers += "cljars" at "https://clojars.org/repo/"

val jacksonV = "2.4.3"

val elastic4sV = "1.3.2"

val elasticSearchV = "1.3.2"

val sparkV = "1.1.0"

libraryDependencies ++= Seq(
  "com.typesafe" % "config" % "1.0.2",
  "com.fasterxml.jackson.module" %% "jackson-module-scala" % jacksonV,
  "com.fasterxml.jackson.core" % "jackson-annotations" % jacksonV,
  "com.fasterxml.jackson.core" % "jackson-core" % jacksonV,
  "com.fasterxml.jackson.core" % "jackson-databind" % jacksonV,
  "com.sksamuel.elastic4s" %% "elastic4s" % elastic4sV exclude("org.elasticsearch", "elasticsearch"),
  "org.elasticsearch" % "elasticsearch" % elasticSearchV,
  "org.apache.spark" %% "spark-core" % sparkV,
  "org.apache.spark" %% "spark-mllib" % sparkV,
  "org.elasticsearch" % "elasticsearch-hadoop" % "2.1.0.Beta2",
  "org.apache.mesos" % "mesos" % "0.18.1" exclude("com.google.protobuf", "protobuf-java"),
  "org.specs2" %% "specs2" % "2.3.13" % "test"
)

packAutoSettings
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
addSbtPlugin("org.xerial.sbt" % "sbt-pack" % "0.6.2")
--------------------------------------------------------------------------------
/src/main/resources/application.conf:
--------------------------------------------------------------------------------
elasticsearch {
  date.format = "yyyy-MM-dd'T'HH:mm:ss.SSSZZ"
  host = "127.0.0.1"
  http.port = 19200
  port = 19300
  index = "scalaio2014"
  cluster = "scalaiocluster"
}

spark {
  master = "spark://127.0.0.1:19400"
  appName = "scalaio2014"
  ratingsFile = "ratings.txt"
  itemsFile = "items.txt"
  dataPath = "/Users/hayssams/git/public/scalaio-2014/workshop/src/main/resources/"
  jarPath = "/Users/hayssams/git/public/scalaio-2014/workshop/target/pack/lib/"
}
--------------------------------------------------------------------------------
/src/main/resources/items.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ebiznext/spark-elasticsearch-mllib/4902f7c28546371ff292923377672676ad33e98b/src/main/resources/items.txt
--------------------------------------------------------------------------------
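application.conf is read through the Typesafe "config" library declared in build.sbt, and Settings.scala further down wraps the keys in typed accessors. Note that spark.dataPath and spark.jarPath are absolute paths on the original author's machine and must be adapted before running. A minimal sanity check of the configuration, where the object name ConfigCheck is hypothetical and not part of the repository:

// Editor's sketch, not part of the repository.
import com.typesafe.config.ConfigFactory

object ConfigCheck extends App {
  // ConfigFactory.load() picks up application.conf from the classpath
  val config = ConfigFactory.load()
  println(config.getString("elasticsearch.host"))   // 127.0.0.1
  println(config.getInt("elasticsearch.http.port")) // 19200
  println(config.getString("spark.master"))         // spark://127.0.0.1:19400
}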
--------------------------------------------------------------------------------
/src/main/scala/Test.scala:
--------------------------------------------------------------------------------
import java.util.Date

import scala.reflect.runtime.universe._

/**
 * Created by hayssams on 16/10/14.
 */
object Test extends App {
}
--------------------------------------------------------------------------------
/src/main/scala/colfilt2/CollaborativeFiltering.scala:
--------------------------------------------------------------------------------
package colfilt2

import java.io.File

import com.sksamuel.elastic4s.ElasticDsl._
import import1.Import
import model.Model.{Rating, User}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel}
import org.apache.spark.rdd.RDD
import utils.{EsClient, Settings}

object CollaborativeFiltering extends App {
  def sparkInit(): SparkContext = {
    import org.apache.spark.{SparkConf, SparkContext}
    val conf = new SparkConf().setAppName(Settings.Spark.AppName).setMaster(Settings.Spark.Master)
    conf.set("es.nodes", "localhost")
    conf.set("es.port", "19200")
    val sparkContext = new SparkContext(conf)

    val libdir = new File(Settings.Spark.JarPath)
    libdir.list().foreach(jar => sparkContext.addJar(Settings.Spark.JarPath + jar))
    sparkContext
  }

  def summary(): Unit = {
    import org.elasticsearch.spark._
    val sparkContext = sparkInit()
    val esType = s"${Settings.ElasticSearch.Index}/${EsClient.esType[Rating]}"
    val ratings = sparkContext.esRDD(esType)
    val users = ratings.map(_._2("userid")).distinct
    val nbUsers = users.count
    val nbItems = ratings.map(_._2("itemid")).distinct.count
    println("vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv")
    println(s"$nbUsers users rated $nbItems items")
    println("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")

    // val res: Seq[String] = rdd.map(_._2("itemid").toString).countByValue().toSeq.sortBy(-_._2).take(50).map(_._1)
    // res.foreach(println)

    sparkContext.stop()
  }

  def als() = {
    import org.elasticsearch.spark._
    val sparkContext = sparkInit()
    val esType = s"${Settings.ElasticSearch.Index}/${EsClient.esType[Rating]}"
    val esWildcardQuery = search in Settings.ElasticSearch.Index -> EsClient.esType[Rating] query { matchall }

    val ratings = sparkContext.esRDD(esType)

    import org.apache.spark.mllib.recommendation.Rating

    val allData = sparkContext.esRDD(esType, esWildcardQuery._builder.toString).cache()

    val allDataCount = allData.count()

    val trainingSet = allData.filter { x =>
      val index = x._2("index").toString.toLong
      index % 10 != 1 && index % 10 != 2
    }.map(x => Rating(x._2("userid").toString.toInt, x._2("itemid").toString.toInt, x._2("rating").toString.toDouble)).cache()

    val validatingSet = allData.filter { x =>
      val index = x._2("index").toString.toLong
      index % 10 == 1
    }.map(x => Rating(x._2("userid").toString.toInt, x._2("itemid").toString.toInt, x._2("rating").toString.toDouble)).cache()

    val testingSet = allData.filter { x =>
      val index = x._2("index").toString.toLong
      index % 10 == 2
    }.map(x => Rating(x._2("userid").toString.toInt, x._2("itemid").toString.toInt, x._2("rating").toString.toDouble)).cache()

    val numTraining = trainingSet.count()
    val numValidation = validatingSet.count()
    val numTest = testingSet.count()

    val ranks = List(12)
    val lambdas = List(0.16)
    val numIters = List(30)
    var bestModel: Option[MatrixFactorizationModel] = None
    var bestValidationRmse = Double.MaxValue
    var bestRank = 0
    var bestLambda = -1.0
    var bestNumIter = -1

    def rmse(model: MatrixFactorizationModel, data: RDD[Rating]) = {
      import org.apache.spark.SparkContext._
      val predictions: RDD[Rating] = model.predict(data.map(x => (x.user, x.product)))
      val predictionsAndRatings = predictions.map(x => ((x.user, x.product), x.rating)).join(data.map(x => ((x.user, x.product), x.rating))).values
      math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).mean())
    }

    for (rank <- ranks; lambda <- lambdas; numIter <- numIters) {
      val model = ALS.train(trainingSet, rank, numIter, lambda)
      val validationRmse = rmse(model, validatingSet)
      println("RMSE (validation) = " + validationRmse + " for the model trained with rank = "
        + rank + ", lambda = " + lambda + ", and numIter = " + numIter + ".")

      println(trainingSet.count() + "////" + testingSet.count() + "////" + validatingSet.count())

      if (validationRmse < bestValidationRmse) {
        bestModel = Some(model)
        bestValidationRmse = validationRmse
        bestRank = rank
        bestLambda = lambda
        bestNumIter = numIter
      }
    }
    val testRmse = rmse(bestModel.get, testingSet)

    val users = trainingSet.map(_.user).distinct.collect()

    //println("================" + users.length)

    val usersRecommendations = users.map { userId =>
      val recommendations = bestModel.get.recommendProducts(userId, 10).map(_.product)
      User(userId, recommendations)
    }

    Import.bulkIndex(usersRecommendations.toList)

    import org.apache.spark.SparkContext._
    val meanRating = trainingSet.union(validatingSet).map(_.rating).mean()

    val baselineRmse =
      math.sqrt(testingSet.map((x: Rating) => (meanRating - x.rating) * (meanRating - x.rating)).mean)
    val improvement = (baselineRmse - testRmse) / baselineRmse * 100
    println("The best model improves the baseline by " + "%1.2f".format(improvement) + "%.")

    println("vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv")
    println("vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv")
    println("vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv")
    println("The best model was trained with rank = " + bestRank + " and lambda = " + bestLambda + ", and numIter = " + bestNumIter + ", and its RMSE on the test set is " + testRmse + ".")
    println("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
    println("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
    println("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")

    sparkContext.stop()

  }

  als()
  Thread.sleep(30000)
}
--------------------------------------------------------------------------------
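CollaborativeFiltering.als() reads the whole Rating type back from Elasticsearch, splits it into training, validation and test sets on index % 10, trains an ALS model, and indexes the top-10 recommendations per user. The MLlib calls can be exercised in isolation; the following self-contained sketch uses a hypothetical object name, toy data, and smaller hyper-parameters than the rank = 12, lambda = 0.16, 30 iterations used above:

// Editor's sketch, not part of the repository.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.{ALS, Rating}

object AlsLocalSketch extends App {
  val sc = new SparkContext(new SparkConf().setAppName("als-sketch").setMaster("local[2]"))

  // Toy ratings: Rating(user, product, rating)
  val ratings = sc.parallelize(Seq(
    Rating(1, 10, 5.0), Rating(1, 20, 1.0),
    Rating(2, 10, 4.0), Rating(2, 30, 2.0),
    Rating(3, 20, 5.0), Rating(3, 30, 4.0)
  ))

  // ALS.train(ratings, rank, iterations, lambda)
  val model = ALS.train(ratings, 5, 10, 0.1)

  println(model.predict(1, 30))                  // predicted rating for an unseen (user, product) pair
  model.recommendProducts(1, 2).foreach(println) // top-2 products for user 1

  sc.stop()
}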
/src/main/scala/import1/Import.scala:
--------------------------------------------------------------------------------
package import1

// imports - will be omitted for other examples

import java.io.File
import java.util.Date

import model.Model.Rating
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.elasticsearch.action.bulk.BulkResponse
import utils.{EsClient, Settings}

import scala.util.control.NonFatal

object Import extends App {

  import com.sksamuel.elastic4s.ElasticDsl._
  import org.elasticsearch.action.admin.indices.delete.DeleteIndexResponse

  def deleteIndex: DeleteIndexResponse = {
    val deleteIndexDefinition = delete index ("scalaio2014")
    EsClient().execute(deleteIndexDefinition).await
  }

  import org.elasticsearch.action.admin.indices.create.CreateIndexResponse

  def createIndex: CreateIndexResponse = {
    import com.sksamuel.elastic4s.mappings.FieldType.{IntegerType, StringType}
    // field names match what Model.Rating.toMap() and Model.User.toMap() produce
    val createIndexDefinition = create index "scalaio2014" mappings(
      "Rating" as(
        "index" typed IntegerType,
        "userid" typed IntegerType,
        "itemid" typed IntegerType,
        "rating" typed IntegerType,
        "timestamp" typed IntegerType
      ),
      "User" as(
        "userid" typed IntegerType,
        "itemids" typed StringType
      )
    )
    EsClient().execute(createIndexDefinition).await
  }

  def bulkIndex[T <: {def toMap(): Map[String, Any]} : Manifest](items: List[T]): BulkResponse = {
    val indexDefinitions = items.map(item => index into s"${Settings.ElasticSearch.Index}/${EsClient.esType[T]}" fields (item.toMap()))
    val bulkDefinition = bulk(
      indexDefinitions: _*
    )
    println(bulkDefinition._builder.requests().toString)
    EsClient().execute(bulkDefinition).await
  }

  def sparkInit(): SparkContext = {
    import org.apache.spark.{SparkConf, SparkContext}
    val conf = new SparkConf().setAppName(Settings.Spark.AppName).setMaster(Settings.Spark.Master)

    val sparkContext = new SparkContext(conf)

    val libdir = new File(Settings.Spark.JarPath)
    libdir.list().foreach(jar => sparkContext.addJar(Settings.Spark.JarPath + jar))
    sparkContext
  }

  def loadFileSpark() = {
    val sparkContext = sparkInit()

    val ratingLines = sparkContext.textFile(Settings.Spark.RatingsFile)

    // note: the counter is captured in the closure, so the generated index is only
    // unique within a Spark partition
    var i = 0
    val ratings: RDD[Rating] = ratingLines.map { ratingLine =>
      val ratingFields = ratingLine.split("\\s+")
      i = i + 1
      Rating(i, ratingFields(0).toInt, ratingFields(1).toInt, ratingFields(2).toInt, ratingFields(3).toLong)
    } cache()

    ratings.foreach {
      rating => bulkIndex(List(rating))
    }

    sparkContext.stop()
  }

  def loadFileLocal(): Unit = {
    val lines = scala.tools.nsc.io.File(Settings.Spark.RatingsFile).lines()
    var i = 0
    val ratings = lines.map { ratingLine =>
      val ratingFields = ratingLine.split("\\s+")
      i = i + 1
      Rating(i, ratingFields(0).toInt, ratingFields(1).toInt, ratingFields(2).toInt, ratingFields(3).toLong)
    }
    ratings.foreach { rating =>
      bulkIndex(List(rating))
    }
  }

  def doItAll(loader: () => Unit): Unit = {
    try {
      deleteIndex
    }
    catch {
      case NonFatal(e) => e.printStackTrace(); println("No existing index to delete")
    }

    createIndex
    println("Index created")
    val start = new Date().getTime
    loader()
    val duration = (new Date().getTime - start) / 1000
    println(s"duration=$duration")
  }

  def doItAllLocal(): Unit = {
    doItAll(loadFileLocal)
  }

  def doItAllSpark(): Unit = {
    doItAll(loadFileSpark)
  }

  doItAllSpark()
}
--------------------------------------------------------------------------------
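loadFileSpark issues one bulk request per rating (bulkIndex(List(rating)) inside foreach), and its var i counter is only unique per partition when run on a real cluster. A hypothetical variant of that method, reusing Import's existing imports and helpers, uses zipWithIndex for a stable index and foreachPartition for chunked bulk requests:

// Editor's sketch, not part of the repository; assumes the sparkInit, bulkIndex
// and Settings members defined in Import above.
def loadFileSparkBatched(): Unit = {
  val sparkContext = sparkInit()
  val ratingLines = sparkContext.textFile(Settings.Spark.RatingsFile)

  val ratings: RDD[Rating] = ratingLines.zipWithIndex().map { case (line, idx) =>
    val f = line.split("\\s+")
    Rating(idx.toInt + 1, f(0).toInt, f(1).toInt, f(2).toInt, f(3).toLong)
  }

  // one bulk request per chunk of 1000 documents instead of one per rating
  ratings.foreachPartition { part =>
    part.grouped(1000).foreach(chunk => bulkIndex(chunk.toList))
  }

  sparkContext.stop()
}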
/src/main/scala/model/Model.scala:
--------------------------------------------------------------------------------
package model

/**
 * Created by hayssams on 20/10/14.
 */
object Model {

  case class Rating(index: Int, userid: Int, itemid: Int, rating: Int, timestamp: Long) {
    def toMap(): Map[String, Any] = Map("index" -> index, "userid" -> userid, "itemid" -> itemid, "rating" -> rating, "timestamp" -> timestamp)
  }

  case class Item(itemid: Int, name: String, timestamp: Long, unknown: Boolean, action: Boolean, adventure: Boolean, animation: Boolean,
                  children: Boolean, comedy: Boolean, crime: Boolean, documentary: Boolean, drama: Boolean, fantasy: Boolean, filmnoir: Boolean, horror: Boolean,
                  musical: Boolean, mystery: Boolean, romance: Boolean, scifi: Boolean, thriller: Boolean, war: Boolean, western: Boolean) {
    def toMap(): Map[String, Any] = Map("itemid" -> itemid, "name" -> name, "timestamp" -> timestamp, "unknown" -> unknown,
      "action" -> action, "adventure" -> adventure, "animation" -> animation, "children" -> children, "comedy" -> comedy, "crime" -> crime, "documentary" -> documentary,
      "drama" -> drama, "fantasy" -> fantasy, "filmnoir" -> filmnoir, "horror" -> horror, "musical" -> musical, "mystery" -> mystery, "romance" -> romance, "scifi" -> scifi,
      "thriller" -> thriller, "war" -> war, "western" -> western)
  }

  case class User(userid: Int, itemids: Array[Int]) {
    def toMap(): Map[String, Any] = Map("userid" -> userid, "itemids" -> itemids.map(_.toString).mkString(","))
  }

}
--------------------------------------------------------------------------------
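The toMap() methods define the exact field names that end up in Elasticsearch (and that CollaborativeFiltering reads back), while EsClient.esType uses the class simple name as the document type. An illustrative REPL-style check, with made-up sample values:

// Editor's sketch, not part of the repository.
import model.Model.{Rating, User}

val r = Rating(index = 1, userid = 196, itemid = 242, rating = 3, timestamp = 881250949L)
println(r.toMap())
// e.g. Map(index -> 1, userid -> 196, itemid -> 242, rating -> 3, timestamp -> 881250949)

val u = User(userid = 196, itemids = Array(242, 302, 377))
println(u.toMap()) // itemids is flattened to the comma-separated string "242,302,377"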
/src/main/scala/utils/Converter.scala:
--------------------------------------------------------------------------------
package utils

import java.io.{BufferedOutputStream, ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

import com.fasterxml.jackson.core.`type`.TypeReference
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule

import scala.Array.canBuildFrom

/**
 * Generic object converters.
 * The binary converter is based on the standard Java serializer;
 * a performance improvement would be to rely on https://code.google.com/p/kryo/
 *
 * The JSON converter is based on the Jackson Scala module.
 */
trait Converter[T] {
  def toDomain[T: Manifest](obj: Array[Byte]): T

  def fromDomain[T: Manifest](value: T): Array[Byte]
}


trait BinaryConverter[T] extends Converter[T] {
  def toDomain[T: Manifest](obj: Array[Byte]): T = safeDecode(obj)

  def fromDomain[T: Manifest](value: T): Array[Byte] = {
    val bos = new ByteArrayOutputStream()
    val out = new ObjectOutputStream(new BufferedOutputStream(bos))
    out writeObject (value)
    out close()
    bos toByteArray()
  }

  def safeDecode[T: Manifest](bytes: Array[Byte]) = {
    val cl = Option(this.getClass().getClassLoader())
    val cin = cl match {
      case Some(cls) =>
        new CustomObjectInputStream(new ByteArrayInputStream(bytes), cls)
      case None =>
        new ObjectInputStream(new ByteArrayInputStream(bytes))
    }
    val obj = cin.readObject
    cin.close
    obj.asInstanceOf[T]
  }
}

trait JSONConverter[T] extends Converter[T] {
  def toDomain[T: Manifest](bytes: Array[Byte]): T = {
    JacksonConverter.deserialize[T](new String(bytes))
  }

  def fromDomain[T: Manifest](value: T): Array[Byte] = {
    JacksonConverter.serialize(value) map (_.toChar) toCharArray() map (_.toByte)
  }
}

object JacksonConverter {

  import java.lang.reflect._

  lazy val mapper = new ObjectMapper().registerModule(DefaultScalaModule)

  def serialize(value: Any): String = {
    mapper.writeValueAsString(value)
  }

  def deserialize[T: Manifest](json: String): T = mapper.readValue(json, typeReference[T])

  private[this] def typeReference[T: Manifest] = new TypeReference[T] {
    override def getType: Type = typeFromManifest(manifest[T])
  }

  private[this] def typeFromManifest(m: Manifest[_]): Type = {
    if (m.typeArguments.isEmpty) {
      m.runtimeClass
    }
    else new ParameterizedType {
      def getRawType = m.runtimeClass

      def getActualTypeArguments = m.typeArguments.map(typeFromManifest).toArray

      def getOwnerType = null
    }
  }
}
--------------------------------------------------------------------------------
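JacksonConverter is what EsClient uses to turn documents into JSON and back. A hypothetical round trip, REPL-style, using the Rating model defined above:

// Editor's sketch, not part of the repository.
import model.Model.Rating
import utils.JacksonConverter

val json = JacksonConverter.serialize(Rating(1, 196, 242, 3, 881250949L))
// roughly {"index":1,"userid":196,"itemid":242,"rating":3,"timestamp":881250949}
val back = JacksonConverter.deserialize[Rating](json)
assert(back.itemid == 242)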
/src/main/scala/utils/CustomObjectInputStream.scala:
--------------------------------------------------------------------------------
package utils

/**
 * Created by hayssams on 04/03/14.
 */

import java.io.{InputStream, ObjectInputStream, ObjectStreamClass}

import scala.Array.canBuildFrom

/**
 * Handles class resolution when running through SBT without forking.
 */
class CustomObjectInputStream(in: InputStream, cl: ClassLoader) extends ObjectInputStream(in) {
  override def resolveClass(cd: ObjectStreamClass): Class[_] =
    try {
      cl.loadClass(cd.getName())
    } catch {
      case cnf: ClassNotFoundException =>
        super.resolveClass(cd)
    }

  override def resolveProxyClass(interfaces: Array[String]): Class[_] =
    try {
      val ifaces = interfaces map { iface => cl.loadClass(iface) }
      java.lang.reflect.Proxy.getProxyClass(cl, ifaces: _*)
    } catch {
      case e: ClassNotFoundException =>
        super.resolveProxyClass(interfaces)
    }
}
--------------------------------------------------------------------------------
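CustomObjectInputStream is only exercised through BinaryConverter.safeDecode, which swaps it in whenever a class loader is available (the non-forked SBT case). A hypothetical REPL-style round trip through the binary converter; case classes are Serializable, so plain Java serialization works here:

// Editor's sketch, not part of the repository.
import model.Model.User
import utils.BinaryConverter

object UserBinaryConverter extends BinaryConverter[User]

val bytes = UserBinaryConverter.fromDomain(User(196, Array(242, 302)))
val user  = UserBinaryConverter.toDomain[User](bytes)
assert(user.userid == 196)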
/src/main/scala/utils/EsClient.scala:
--------------------------------------------------------------------------------
package utils

import java.util.Date

import com.sksamuel.elastic4s.ElasticClient
import com.sksamuel.elastic4s.ElasticDsl._
import com.sksamuel.elastic4s.source.DocumentSource
import org.elasticsearch.common.settings.ImmutableSettings
import org.elasticsearch.search.SearchHit

import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent._


object EsClient {
  val settings = ImmutableSettings.settingsBuilder().put("cluster.name", Settings.ElasticSearch.Cluster).build()
  val client = ElasticClient.remote(settings, (Settings.ElasticSearch.Host, Settings.ElasticSearch.Port))

  def apply(): ElasticClient = client

  def index[T: Manifest](t: T, refresh: Boolean = true): String = {
    val json = JacksonConverter.serialize(t)
    val res = client.client.prepareIndex(Settings.ElasticSearch.Index, manifest[T].runtimeClass.getSimpleName)
      .setSource(json)
      .setRefresh(refresh)
      .execute()
      .actionGet()
    res.getId
  }

  def load[T: Manifest](uuid: String): Future[Option[T]] = {
    val req = get id uuid from Settings.ElasticSearch.Index -> manifest[T].runtimeClass.getSimpleName
    val res = client.execute(req)
    res map { res =>
      if (res.isExists) Some(JacksonConverter.deserialize[T](res.getSourceAsString)) else None
    }
  }

  def loadWithVersion[T: Manifest](uuid: String): Future[Option[(T, Long)]] = {
    val req = get id uuid from Settings.ElasticSearch.Index -> manifest[T].runtimeClass.getSimpleName
    val res = client.execute(req)
    res map { res =>
      val maybeT = if (res.isExists) Some(JacksonConverter.deserialize[T](res.getSourceAsString)) else None
      maybeT map ((_, res.getVersion))
    }
  }

  def delete[T: Manifest](uuid: String, refresh: Boolean): Future[Boolean] = {
    val req = com.sksamuel.elastic4s.ElasticDsl.delete id uuid from Settings.ElasticSearch.Index -> manifest[T].runtimeClass.getSimpleName refresh refresh
    val res = client.execute(req)
    res map { res =>
      res.isFound
    }
  }

  def update[T: Manifest](uuid: String, t: T, upsert: Boolean, refresh: Boolean): Future[Boolean] = {
    val js = JacksonConverter.serialize(t)
    val req = com.sksamuel.elastic4s.ElasticDsl.update id uuid in Settings.ElasticSearch.Index -> manifest[T].runtimeClass.getSimpleName refresh refresh doc new DocumentSource {
      override def json: String = js
    }
    req.docAsUpsert(upsert)
    val res = client.execute(req)
    res.map { res =>
      res.isCreated || res.getVersion > 1
    }
  }

  def updateWithVersion[T: Manifest](uuid: String, t: T, version: Long) = {
    val js = JacksonConverter.serialize(t)
    val req = com.sksamuel.elastic4s.ElasticDsl.update id uuid in Settings.ElasticSearch.Index -> manifest[T].runtimeClass.getSimpleName version version doc new DocumentSource {
      override def json: String = js
    }
    val res = client.execute(req)
    true
  }


  def searchAll[T: Manifest](req: SearchDefinition): Future[Seq[T]] = {
    val res = client.execute(req.size(Integer.MAX_VALUE))
    res.map { res =>
      res.getHits.getHits.map { hit => JacksonConverter.deserialize[T](hit.getSourceAsString) }
    }
  }

  def search[T: Manifest](req: SearchDefinition): Future[Option[T]] = {
    val res = client.execute(req.size(Integer.MAX_VALUE))
    res.map { res =>
      if (res.getHits.getTotalHits == 0)
        None
      else
        Some(JacksonConverter.deserialize[T](res.getHits.getHits()(0).getSourceAsString))
    }
  }

  def searchAllRaw(req: SearchDefinition): Future[Array[SearchHit]] = {
    val res = client.execute(req.size(Integer.MAX_VALUE))
    res.map { res =>
      res.getHits.getHits
    }
  }

  def searchRaw(req: SearchDefinition): Future[Option[SearchHit]] = {
    val res = client.execute(req.size(Integer.MAX_VALUE))
    res.map { res =>
      if (res.getHits.getTotalHits == 0)
        None
      else
        Some(res.getHits.getHits()(0))
    }
  }

  def esType[T: Manifest]: String = {
    val rt = manifest[T].runtimeClass
    rt.getSimpleName
  }

}
--------------------------------------------------------------------------------
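EsClient wraps a single remote elastic4s client configured from Settings. A hypothetical read-back of the indexed ratings, using the same wildcard query shape as CollaborativeFiltering.als():

// Editor's sketch, not part of the repository.
import com.sksamuel.elastic4s.ElasticDsl._
import model.Model.Rating
import utils.{EsClient, Settings}

import scala.concurrent.Await
import scala.concurrent.duration._

val req = search in Settings.ElasticSearch.Index -> EsClient.esType[Rating] query { matchall }
val ratings: Seq[Rating] = Await.result(EsClient.searchAll[Rating](req), 30.seconds)
println(s"${ratings.size} ratings read back from Elasticsearch")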
/src/main/scala/utils/Settings.scala:
--------------------------------------------------------------------------------
package utils

import com.typesafe.config.ConfigFactory

object Settings {
  private val config = ConfigFactory.load()

  val Env = if (System.getenv.containsKey("PRODUCTION")) {
    Environment.PROD
  } else {
    Environment.DEV
  }

  object ElasticSearch {
    val DateFormat = config.getString("elasticsearch.date.format")
    val Host = config.getString("elasticsearch.host")
    val HttpPort = config.getInt("elasticsearch.http.port")
    val Port = config.getInt("elasticsearch.port")
    val Index = config.getString("elasticsearch.index")
    val Cluster = config.getString("elasticsearch.cluster")
    val FullUrl = Host + ":" + HttpPort
    println("ElasticSearch on " + Host + ":" + Port + ", index->" + Index + ", cluster->" + Cluster)
  }

  object Spark {
    val Master = config.getString("spark.master")
    val AppName = config.getString("spark.appName")
    val DataPath = config.getString("spark.dataPath")
    val RatingsFile = DataPath + config.getString("spark.ratingsFile")
    val ItemsFile = DataPath + config.getString("spark.itemsFile")
    val JarPath = config.getString("spark.jarPath")

  }

}

object Environment extends Enumeration {
  type Environment = Value
  val DEV = Value(1)
  val PROD = Value(2)
}
--------------------------------------------------------------------------------
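Both entry points are App objects: Import ends by calling doItAllSpark() and CollaborativeFiltering ends by calling als(), and they are launched one after the other. Expressed as a hypothetical single driver (not part of the repository), the call order would be:

// Editor's sketch, not part of the repository.
object Workflow extends App {
  import1.Import.doItAllSpark()          // drop/recreate the index and load ratings.txt
  colfilt2.CollaborativeFiltering.als()  // train ALS and index per-user recommendations
}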
/workshop-spark.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ebiznext/spark-elasticsearch-mllib/4902f7c28546371ff292923377672676ad33e98b/workshop-spark.pdf
--------------------------------------------------------------------------------