├── .gitignore
├── README
├── build.sbt
└── src
    └── main
        ├── resources
        │   ├── application.conf
        │   └── log4j.properties
        └── scala
            └── com
                └── edwardsbean
                    └── spark
                        ├── AppDownloadCount.scala
                        └── SearchCount.scala

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.log
.idea
target
*.iml
project

--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
A real-time computation project based on:
- Flume, for data ingestion
- Spark Streaming, for data analysis
- Redis, as the data store
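
Both jobs consume Flume events whose body is a single space-separated line
with three fields. For AppDownloadCount the fields are imei, app name, and
timestamp in milliseconds, e.g.:

    123 熊猫看书 1410505200000
    123 91desktop 1410505200000

For SearchCount the fields are imei, request time, and search word.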

--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
name := "flume-spark-streaming"

version := "1.0"

scalaVersion := "2.10.3"

libraryDependencies += "org.apache.spark" %% "spark-core" % "1.0.0"

libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.0.0"

libraryDependencies += "org.apache.spark" %% "spark-streaming-flume" % "1.0.0"

libraryDependencies += "redis.clients" % "jedis" % "2.4.2"

// Typesafe Config is used directly (ConfigFactory), so declare it explicitly
// instead of relying on Spark pulling it in transitively
libraryDependencies += "com.typesafe" % "config" % "1.2.1"

--------------------------------------------------------------------------------
/src/main/resources/application.conf:
--------------------------------------------------------------------------------
spark.master.ip="local[2]"
spark.listen.ip=localhost
spark.listen.port=5555
spark.redis.ip=localhost
spark.redis.port=6379
# Read by AppDownloadCount; leave empty when Redis auth is disabled
spark.redis.pwd=""

--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
# Set everything to be logged to the console
log4j.rootCategory=ERROR, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# Settings to quiet third party logs that are too verbose
log4j.logger.org.eclipse.jetty=WARN
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO

--------------------------------------------------------------------------------
/src/main/scala/com/edwardsbean/spark/AppDownloadCount.scala:
--------------------------------------------------------------------------------
package com.edwardsbean.spark

import java.util.concurrent.TimeUnit

import com.typesafe.config.ConfigFactory
import org.apache.commons.pool2.impl.GenericObjectPoolConfig
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.flume.{SparkFlumeEvent, FlumeUtils}
import redis.clients.jedis.JedisPool

/**
 * Real-time count of downloads per app, plus the running total.
 *
 * Spark keeps one dictionary: per-user download records, used to check
 * whether the same app was downloaded again within the last hour.
 * Redis keeps one dictionary: per-app download counts, for aggregation.
 */
object AppDownloadCount {

  //123 熊猫看书 1410505200000
  //123 91desktop 1410505200000
  case class UserDownload(imei: String, appName: String, timestamp: Long)

  // JedisPool itself is not serializable, so broadcast a wrapper that
  // builds the pool lazily on each executor instead
  class RedisPoolFactory(host: String, port: Int, pwd: String) extends Serializable {
    lazy val pool: JedisPool = createRedisPool(host, port, pwd)
  }

  // Keep only well-formed events: "imei appName timestamp"
  def etl(flumeEvent: SparkFlumeEvent): Boolean = {
    val raw = new String(flumeEvent.event.getBody.array())
    val pairs = raw.split(" ")
    pairs.size == 3
  }

  def parseRawUserDownload(flumeEvent: SparkFlumeEvent): UserDownload = {
    val raw = new String(flumeEvent.event.getBody.array())
    val pairs = raw.split(" ")
    UserDownload(pairs(0), pairs(1), pairs(2).toLong)
  }

  // def AppDownloadCountAndWindow(source: DStream[UserDownload], duration: Duration): DStream[(String, Int)] = {
  //   source.map(userDownload => (userDownload.appName, 1))
  //     .reduceByKeyAndWindow(_ + _, duration)
  // }

  def AppDownloadCount(source: DStream[UserDownload], dict: DStream[(String, Long)]): DStream[(String, Int)] = {
    val downloads = source.map(userDownload => (combine(userDownload.imei, userDownload.appName), userDownload.timestamp))
    downloads.cogroup(dict).flatMap { case (key, (download, history)) =>
      // Keys that exist only in the state dictionary carry no new download
      // in this batch, so the download side of the cogroup may be empty
      download.headOption.map { currentTime =>
        // Count the download only if the last recorded download of this app
        // by this user was more than an hour ago
        val counted = history.headOption match {
          case Some(previousTime) if TimeUnit.MILLISECONDS.toHours(currentTime - previousTime) < 1 => 0
          case _ => 1
        }
        (getAppName(key), counted)
      }
    }.reduceByKey(_ + _)
  }

  def updateAppDownload(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = {
    val currentCount = newValues.sum
    val previousCount = runningCount.getOrElse(0)
    Some(currentCount + previousCount)
  }

  def updateDict(newValues: Seq[Long], runningCount: Option[Long]): Option[Long] = {
    // newValues is empty for keys that saw no events in this batch,
    // so guard against calling max on an empty Seq
    val currentMax = if (newValues.isEmpty) 0L else newValues.max
    Some(math.max(currentMax, runningCount.getOrElse(0L)))
  }

  def createRedisPool(host: String, port: Int, pwd: String): JedisPool = {
    val pc = new GenericObjectPoolConfig()
    pc.setMaxIdle(5)
    pc.setMaxTotal(5)
    // A null password makes Jedis skip AUTH, for Redis without auth enabled
    val password = if (pwd == null || pwd.isEmpty) null else pwd
    new JedisPool(pc, host, port, 10000, password)
  }

  def combine(imei: String, appName: String): String = {
    imei + "," + appName
  }

  def getAppName(combine: String): String = {
    combine.split(",")(1)
  }

  def sinkToRedis(downloadCount: DStream[(String, Int)], pool: Broadcast[RedisPoolFactory]): Unit = {
    downloadCount.foreachRDD(rdd => {
      rdd.foreachPartition(partitionOfRecords => {
        val jedis = pool.value.pool.getResource
        partitionOfRecords.foreach { case (appName: String, downCount: Int) =>
          // Aggregate into the running total
          jedis.hincrBy(appName, "totalDownloadCount", downCount)
          // Downloads within the current batch interval
          jedis.hset(appName, "downloadCount", downCount.toString)
        }
        pool.value.pool.returnResource(jedis)
      })
    })
  }
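
  // A minimal read-side sketch (not part of the original job): how a consumer
  // might fetch the counters written by sinkToRedis, assuming the same Redis
  // instance and the hash layout used above
  def readAppDownloadCount(pool: JedisPool, appName: String): (Long, Long) = {
    val jedis = pool.getResource
    try {
      // hget returns null when the field has not been written yet
      val total = Option(jedis.hget(appName, "totalDownloadCount")).map(_.toLong).getOrElse(0L)
      val latest = Option(jedis.hget(appName, "downloadCount")).map(_.toLong).getOrElse(0L)
      (total, latest)
    } finally {
      pool.returnResource(jedis)
    }
  }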

  def main(args: Array[String]) {
    val config = ConfigFactory.load()
    val master = config.getString("spark.master.ip")
    val ip = config.getString("spark.listen.ip")
    val port = config.getInt("spark.listen.port")
    val redisIp = config.getString("spark.redis.ip")
    val redisPort = config.getInt("spark.redis.port")
    val redisPwd = config.getString("spark.redis.pwd")

    val ssc = new StreamingContext(master, "AppDownloadCount", Seconds(1))
    // updateStateByKey requires a checkpoint directory
    ssc.checkpoint("checkpoint")
    val source = FlumeUtils.createStream(ssc, ip, port, StorageLevel.MEMORY_ONLY)

    // Reuse connections through a pool, built lazily on each executor
    val pool = ssc.sparkContext.broadcast(new RedisPoolFactory(redisIp, redisPort, redisPwd))

    // Pre-processing
    val cleanSource = source.filter(etl).map(parseRawUserDownload).cache()

    /**
     * TODO: how to expire stale dictionary entries
     * TODO: the state already includes the current batch's timestamp, so the
     * one-hour check compares a download against itself; the dictionary would
     * need to keep the previous timestamp for the dedup to work as intended
     */
    val dict = cleanSource.map { userDownload =>
      (combine(userDownload.imei, userDownload.appName), userDownload.timestamp)
    }
    // Live dictionary: each user's most recent download time
    val currentDict = dict.updateStateByKey(updateDict)

    // Count user downloads
    val downloadCount = AppDownloadCount(cleanSource, currentDict)

    // Print to console
    downloadCount.print()

    // Write to Redis
    sinkToRedis(downloadCount, pool)

    ssc.start()
    ssc.awaitTermination()
  }

}

--------------------------------------------------------------------------------
/src/main/scala/com/edwardsbean/spark/SearchCount.scala:
--------------------------------------------------------------------------------
package com.edwardsbean.spark

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.flume.FlumeUtils
import redis.clients.jedis.Jedis

/**
 * Real-time hot-search-word statistics
 *
 * 1: real-time top-10 hot words
 * 2: fastest-growing words, in real time
 * 3: real-time growth
 *
 * Created by edwardsbean on 14-9-15.
 */
object SearchCount {

  case class SearchRecord(imei: String, requestTime: String, searchWord: String)

  def main(args: Array[String]) {
    val ssc = new StreamingContext("local[2]", "SearchWordCount", Seconds(1))
    // Listen for Flume events
    val searchSource: DStream[SearchRecord] = FlumeUtils.createStream(ssc, "localhost", 5555, StorageLevel.MEMORY_ONLY)
      // Read the raw Flume event body
      .map(flumeEvent => new String(flumeEvent.event.getBody.array()).split(" "))
      // Parse into SearchRecord objects
      .map(row => SearchRecord(row(0), row(1), row(2)))

    // Compute => (search word, search count)
    val searchResult: DStream[(String, Int)] = searchSource.map(searchRecord => (searchRecord.searchWord, 1)).reduceByKey(_ + _)

    // Write to Redis; the connection is created per partition on the
    // executor, since Jedis instances are not serializable
    searchResult.foreachRDD(rdd => rdd.foreachPartition { partition =>
      val jedis = new Jedis("localhost")
      partition.foreach { case (searchWord, searchCount) =>
        // Score for the current batch, and a running total per word
        jedis.zadd("hotSearch", searchCount, searchWord)
        jedis.zincrby("hotSearchTotal", searchCount, searchWord)
      }
      jedis.disconnect()
    })

    // searchResult.foreachRDD(rdd => rdd.map(x => (x._2,x._1)).)

    ssc.start()
    ssc.awaitTermination()
  }
}

--------------------------------------------------------------------------------