├── .gitignore
├── README
├── build.sbt
└── src
    └── main
        ├── resources
        │   ├── application.conf
        │   └── log4j.properties
        └── scala
            └── com
                └── edwardsbean
                    └── spark
                        ├── AppDownloadCount.scala
                        └── SearchCount.scala

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.log
.idea
target
*.iml
project

--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
A real-time computation project based on:
- Flume, for data ingestion
- Spark Streaming, for data analysis
- Redis, as the data store
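
Both jobs consume Flume events whose body is a single space-separated line
with three fields. For AppDownloadCount the fields are imei, app name, and
timestamp in milliseconds, e.g.:

    123 熊猫看书 1410505200000
    123 91desktop 1410505200000

For SearchCount the fields are imei, request time, and search word.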

--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
name := "flume-spark-streaming"

version := "1.0"

scalaVersion := "2.10.3"

libraryDependencies += "org.apache.spark" %% "spark-core" % "1.0.0"

libraryDependencies += "org.apache.spark" %% "spark-streaming" % "1.0.0"

libraryDependencies += "org.apache.spark" %% "spark-streaming-flume" % "1.0.0"

libraryDependencies += "redis.clients" % "jedis" % "2.4.2"

// Typesafe Config is used directly (ConfigFactory), so declare it explicitly
// instead of relying on Spark pulling it in transitively
libraryDependencies += "com.typesafe" % "config" % "1.2.1"

--------------------------------------------------------------------------------
/src/main/resources/application.conf:
--------------------------------------------------------------------------------
spark.master.ip="local[2]"
spark.listen.ip=localhost
spark.listen.port=5555
spark.redis.ip=localhost
spark.redis.port=6379
# Read by AppDownloadCount; leave empty when Redis auth is disabled
spark.redis.pwd=""

--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
# Set everything to be logged to the console
log4j.rootCategory=ERROR, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# Settings to quiet third party logs that are too verbose
log4j.logger.org.eclipse.jetty=WARN
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO

--------------------------------------------------------------------------------
/src/main/scala/com/edwardsbean/spark/AppDownloadCount.scala:
--------------------------------------------------------------------------------
package com.edwardsbean.spark

import java.util.concurrent.TimeUnit

import com.typesafe.config.ConfigFactory
import org.apache.commons.pool2.impl.GenericObjectPoolConfig
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.flume.{SparkFlumeEvent, FlumeUtils}
import redis.clients.jedis.JedisPool

/**
 * Real-time count of downloads per app, plus the running total.
 *
 * Spark keeps one dictionary: per-user download records, used to check
 * whether the same app was downloaded again within the last hour.
 * Redis keeps one dictionary: per-app download counts, for aggregation.
 */
object AppDownloadCount {

  //123 熊猫看书 1410505200000
  //123 91desktop 1410505200000
  case class UserDownload(imei: String, appName: String, timestamp: Long)

  // JedisPool itself is not serializable, so broadcast a wrapper that
  // builds the pool lazily on each executor instead
  class RedisPoolFactory(host: String, port: Int, pwd: String) extends Serializable {
    lazy val pool: JedisPool = createRedisPool(host, port, pwd)
  }

  // Keep only well-formed events: "imei appName timestamp"
  def etl(flumeEvent: SparkFlumeEvent): Boolean = {
    val raw = new String(flumeEvent.event.getBody.array())
    val pairs = raw.split(" ")
    pairs.size == 3
  }

  def parseRawUserDownload(flumeEvent: SparkFlumeEvent): UserDownload = {
    val raw = new String(flumeEvent.event.getBody.array())
    val pairs = raw.split(" ")
    UserDownload(pairs(0), pairs(1), pairs(2).toLong)
  }

  // def AppDownloadCountAndWindow(source: DStream[UserDownload], duration: Duration): DStream[(String, Int)] = {
  //   source.map(userDownload => (userDownload.appName, 1))
  //     .reduceByKeyAndWindow(_ + _, duration)
  // }

  def AppDownloadCount(source: DStream[UserDownload], dict: DStream[(String, Long)]): DStream[(String, Int)] = {
    val downloads = source.map(userDownload => (combine(userDownload.imei, userDownload.appName), userDownload.timestamp))
    downloads.cogroup(dict).flatMap { case (key, (download, history)) =>
      // Keys that exist only in the state dictionary carry no new download
      // in this batch, so the download side of the cogroup may be empty
      download.headOption.map { currentTime =>
        // Count the download only if the last recorded download of this app
        // by this user was more than an hour ago
        val counted = history.headOption match {
          case Some(previousTime) if TimeUnit.MILLISECONDS.toHours(currentTime - previousTime) < 1 => 0
          case _ => 1
        }
        (getAppName(key), counted)
      }
    }.reduceByKey(_ + _)
  }

  def updateAppDownload(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = {
    val currentCount = newValues.sum
    val previousCount = runningCount.getOrElse(0)
    Some(currentCount + previousCount)
  }

  def updateDict(newValues: Seq[Long], runningCount: Option[Long]): Option[Long] = {
    // newValues is empty for keys that saw no events in this batch,
    // so guard against calling max on an empty Seq
    val currentMax = if (newValues.isEmpty) 0L else newValues.max
    Some(math.max(currentMax, runningCount.getOrElse(0L)))
  }

  def createRedisPool(host: String, port: Int, pwd: String): JedisPool = {
    val pc = new GenericObjectPoolConfig()
    pc.setMaxIdle(5)
    pc.setMaxTotal(5)
    // A null password makes Jedis skip AUTH, for Redis without auth enabled
    val password = if (pwd == null || pwd.isEmpty) null else pwd
    new JedisPool(pc, host, port, 10000, password)
  }

  def combine(imei: String, appName: String): String = {
    imei + "," + appName
  }

  def getAppName(combine: String): String = {
    combine.split(",")(1)
  }

  def sinkToRedis(downloadCount: DStream[(String, Int)], pool: Broadcast[RedisPoolFactory]): Unit = {
    downloadCount.foreachRDD(rdd => {
      rdd.foreachPartition(partitionOfRecords => {
        val jedis = pool.value.pool.getResource
        partitionOfRecords.foreach { case (appName: String, downCount: Int) =>
          // Aggregate into the running total
          jedis.hincrBy(appName, "totalDownloadCount", downCount)
          // Downloads within the current batch interval
          jedis.hset(appName, "downloadCount", downCount.toString)
        }
        pool.value.pool.returnResource(jedis)
      })
    })
  }
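
  // A minimal read-side sketch (not part of the original job): how a consumer
  // might fetch the counters written by sinkToRedis, assuming the same Redis
  // instance and the hash layout used above
  def readAppDownloadCount(pool: JedisPool, appName: String): (Long, Long) = {
    val jedis = pool.getResource
    try {
      // hget returns null when the field has not been written yet
      val total = Option(jedis.hget(appName, "totalDownloadCount")).map(_.toLong).getOrElse(0L)
      val latest = Option(jedis.hget(appName, "downloadCount")).map(_.toLong).getOrElse(0L)
      (total, latest)
    } finally {
      pool.returnResource(jedis)
    }
  }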

  def main(args: Array[String]) {
    val config = ConfigFactory.load()
    val master = config.getString("spark.master.ip")
    val ip = config.getString("spark.listen.ip")
    val port = config.getInt("spark.listen.port")
    val redisIp = config.getString("spark.redis.ip")
    val redisPort = config.getInt("spark.redis.port")
    val redisPwd = config.getString("spark.redis.pwd")

    val ssc = new StreamingContext(master, "AppDownloadCount", Seconds(1))
    // updateStateByKey requires a checkpoint directory
    ssc.checkpoint("checkpoint")
    val source = FlumeUtils.createStream(ssc, ip, port, StorageLevel.MEMORY_ONLY)

    // Reuse connections through a pool, built lazily on each executor
    val pool = ssc.sparkContext.broadcast(new RedisPoolFactory(redisIp, redisPort, redisPwd))

    // Pre-processing
    val cleanSource = source.filter(etl).map(parseRawUserDownload).cache()

    /**
     * TODO: how to expire stale dictionary entries
     * TODO: the state already includes the current batch's timestamp, so the
     * one-hour check compares a download against itself; the dictionary would
     * need to keep the previous timestamp for the dedup to work as intended
     */
    val dict = cleanSource.map { userDownload =>
      (combine(userDownload.imei, userDownload.appName), userDownload.timestamp)
    }
    // Live dictionary: each user's most recent download time
    val currentDict = dict.updateStateByKey(updateDict)

    // Count user downloads
    val downloadCount = AppDownloadCount(cleanSource, currentDict)

    // Print to console
    downloadCount.print()

    // Write to Redis
    sinkToRedis(downloadCount, pool)

    ssc.start()
    ssc.awaitTermination()
  }

}

--------------------------------------------------------------------------------
/src/main/scala/com/edwardsbean/spark/SearchCount.scala:
--------------------------------------------------------------------------------
package com.edwardsbean.spark

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.flume.FlumeUtils
import redis.clients.jedis.Jedis

/**
 * Real-time hot-search-word statistics
 *
 * 1: real-time top-10 hot words
 * 2: fastest-growing words, in real time
 * 3: real-time growth
 *
 * Created by edwardsbean on 14-9-15.
 */
object SearchCount {

  case class SearchRecord(imei: String, requestTime: String, searchWord: String)

  def main(args: Array[String]) {
    val ssc = new StreamingContext("local[2]", "SearchWordCount", Seconds(1))
    // Listen for Flume events
    val searchSource: DStream[SearchRecord] = FlumeUtils.createStream(ssc, "localhost", 5555, StorageLevel.MEMORY_ONLY)
      // Read the raw Flume event body
      .map(flumeEvent => new String(flumeEvent.event.getBody.array()).split(" "))
      // Parse into SearchRecord objects
      .map(row => SearchRecord(row(0), row(1), row(2)))

    // Compute => (search word, search count)
    val searchResult: DStream[(String, Int)] = searchSource.map(searchRecord => (searchRecord.searchWord, 1)).reduceByKey(_ + _)

    // Write to Redis; the connection is created per partition on the
    // executor, since Jedis instances are not serializable
    searchResult.foreachRDD(rdd => rdd.foreachPartition { partition =>
      val jedis = new Jedis("localhost")
      partition.foreach { case (searchWord, searchCount) =>
        // Score for the current batch, and a running total per word
        jedis.zadd("hotSearch", searchCount, searchWord)
        jedis.zincrby("hotSearchTotal", searchCount, searchWord)
      }
      jedis.disconnect()
    })

    // searchResult.foreachRDD(rdd => rdd.map(x => (x._2,x._1)).)

    ssc.start()
    ssc.awaitTermination()
  }
}

--------------------------------------------------------------------------------