├── .gitignore
├── README
├── build.sbt
├── src
│   └── main
│       ├── resources
│       │   ├── application.conf
│       │   └── log4j.properties
│       └── scala
│           └── org
│               └── github
│                   └── drunk2013
│                       └── spark
│                           └── streaming
│                               ├── FlumeNginxAnalyze.scala
│                               ├── KafkaDemo.scala
│                               ├── KafkaNginxAnalyze.scala
│                               ├── model
│                               │   └── AccessInfo.scala
│                               └── util
│                                   └── Util.scala
└── update.sh

/.gitignore:
--------------------------------------------------------------------------------
*.jar
target/
*.dll
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1. Ingest data from flume-ng, run the aggregations with Spark Streaming, and write the results to MySQL.
   - Flume: data ingestion
   - Spark Streaming: data analysis
   - MySQL: result store (the Redis connection pool in the code is currently commented out)

2. Ingest data from Kafka, convert each batch to a DataFrame, and run SQL aggregations on it.

3. test
--------------------------------------------------------------------------------
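A possible way to build and launch the two pipelines described above (a sketch only: the jar path comes from update.sh, the class names from the sources below, and the FlumeNginxAnalyze argument order from its main method; the Flume receiver port and the MySQL settings are placeholders). Note that sbt package does not bundle dependencies, so the Flume/Kafka integration jars and a MySQL JDBC driver need to be supplied separately, for example via --jars or --packages.

    sbt package

    # Flume -> Spark Streaming -> MySQL
    # args: flume-host flume-port batch-interval-ms mysql-host mysql-port mysql-db mysql-user mysql-password client-type task-name
    spark-submit --class org.github.drunk2013.spark.streaming.FlumeNginxAnalyze \
        target/scala-2.10/spark-streaming_2.10-1.0.jar \
        localhost 9999 5000 localhost 3306 spark spark spark 1 nginxAnalyze

    # Kafka -> DataFrame -> Spark SQL (reads topic "test", takes no arguments)
    spark-submit --class org.github.drunk2013.spark.streaming.KafkaNginxAnalyze \
        target/scala-2.10/spark-streaming_2.10-1.0.jar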
/build.sbt:
--------------------------------------------------------------------------------
name := "spark-streaming"

version := "1.0"

scalaVersion := "2.10.3"

val sparkVersion = "1.4.0"

libraryDependencies += "org.apache.spark" %% "spark-core" % sparkVersion

libraryDependencies += "org.apache.spark" %% "spark-streaming" % sparkVersion

libraryDependencies += "org.apache.spark" %% "spark-streaming-flume" % sparkVersion

libraryDependencies += "redis.clients" % "jedis" % "2.4.2"

libraryDependencies += "org.apache.spark" %% "spark-sql" % sparkVersion

libraryDependencies += "org.apache.spark" %% "spark-streaming-kafka" % sparkVersion

libraryDependencies += "org.apache.kafka" %% "kafka" % "0.8.1.1"
--------------------------------------------------------------------------------
/src/main/resources/application.conf:
--------------------------------------------------------------------------------
task.name=nginxAnalyze
batch.interval.size=5
mysql.hostname=localhost
mysql.db=spark
mysql.port=3306
mysql.username=spark
mysql.password=spark
client.type=1
--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
# Set everything to be logged to the console
log4j.rootCategory=ERROR, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

# Settings to quiet third party logs that are too verbose
log4j.logger.org.eclipse.jetty=WARN
log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
--------------------------------------------------------------------------------
/src/main/scala/org/github/drunk2013/spark/streaming/FlumeNginxAnalyze.scala:
--------------------------------------------------------------------------------
package org.github.drunk2013.spark.streaming

import java.util.concurrent.TimeUnit

import com.typesafe.config.ConfigFactory
import org.apache.commons.pool2.impl.GenericObjectPoolConfig
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.flume.{SparkFlumeEvent, FlumeUtils}
import org.apache.spark.SparkContext._
//import redis.clients.jedis.{JedisPool, Jedis}
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.JdbcRDD

import java.sql.{DriverManager, PreparedStatement, Connection}

/**
 * Real-time per-batch UV, PV and traffic statistics over nginx access logs,
 * ingested from flume-ng and written to MySQL.
 */
object FlumeNginxAnalyze {

  //192.168.45.212,2015-06-04 13:33,28,adfasdfasdfasdfas
  case class AccessInfo(ip: String, time: String, http_method: String, access_url: String, http_version: String, http_status: String, traffic: Long, referrers: String, agents: String, cookie: String)

  case class result(count: Int, pv: Int, uv: Int, traffic: Long)

  // Keep only events whose body matches the expected nginx log format.
  def etl(flumeEvent: SparkFlumeEvent): Boolean = {
    val raw = new String(flumeEvent.event.getBody.array())
    val regex = """(.*) - - \[(.*)\] "([A-Z]+) (.*) (.*) ([\d]+) ([\d]+) (.*) "(.*)" "(.*)""".r
    var flag = false
    try {
      val regex(ips, tm, http_method, access_url, http_version, http_status, traffic, referrers, agents, cookie) = raw
      flag = true
    } catch {
      case e: Exception => flag = false // the line does not match the expected format
    }
    if (!flag) {
      println(raw)
    }
    flag
  }

  // Parse one raw log line into an AccessInfo record.
  def parseRawAccessInfo(flumeEvent: SparkFlumeEvent): AccessInfo = {
    val raw = new String(flumeEvent.event.getBody.array())

    val regex = """(.*) - - \[(.*)\] "([A-Z]+) (.*) (.*) ([\d]+) ([\d]+) (.*) "(.*)" "(.*)""".r
    val regex(ips, tm, http_method, access_url, http_version, http_status, traffic, referrers, agents, cookie) = raw
    AccessInfo(ips, tm, http_method, access_url, http_version, http_status, traffic.toLong, referrers, agents, cookie)
  }

  // Total traffic (bytes) in the batch.
  def trafficCount(source: DStream[AccessInfo]): DStream[(String, Long)] = {
    source.map { accessInfo =>
      ("1", accessInfo.traffic)
    }.reduceByKey(_ + _)
  }

  // PV: total number of requests in the batch.
  def pvCount(source: DStream[AccessInfo]): DStream[(String, Long)] = {
    source.map { accessInfo =>
      ("1", 1L)
    }.reduceByKey(_ + _)
  }

  // Requests per cookie; the input for the UV count and for the uvList table.
  def uvList(source: DStream[AccessInfo]): DStream[(String, Long)] = {
    source.map { accessInfo =>
      (accessInfo.cookie, 1L)
    }.reduceByKey(_ + _)
  }

  // UV: number of distinct cookies in the batch.
  def uvCount(source: DStream[(String, Long)]): DStream[(String, Long)] = {
    source.map { kv =>
      ("1", 1L)
    }.reduceByKey(_ + _)
  }

  // Write the per-batch results to MySQL.
  def sinkToMysql(traffic: DStream[(String, Long)], pv: DStream[(String, Long)], uv: DStream[(String, Long)], uv_list: DStream[(String, Long)],
                  jdbcUrl: String, username: String, password: String, clientType: Int): Unit = {
    // Per-cookie access counts.
    uv_list.foreachRDD(rdd => {
      rdd.foreachPartition(partitionOfRecords => {
        // One connection and one prepared statement per partition, instead of per record.
        var conn: Connection = null
        var ps: PreparedStatement = null
        try {
          conn = DriverManager.getConnection(jdbcUrl, username, password)
          ps = conn.prepareStatement("insert into uvList(cookie,count,client_type) values (?,?,?)")
          partitionOfRecords.foreach { case (cookie: String, accessCount: Long) =>
            ps.setString(1, cookie)
            ps.setLong(2, accessCount)
            ps.setInt(3, clientType)
            ps.executeUpdate()
          }
        } catch {
          case e: Exception => println("Mysql Exception" + e)
        } finally {
          if (ps != null) {
            ps.close()
          }
          if (conn != null) {
            conn.close()
          }
        }
      })
    })

    // Traffic, PV and UV of the batch, written as a single row.
    traffic.cogroup(pv).cogroup(uv).foreachRDD(rdd => {
      rdd.foreachPartition(partitionOfRecords => {
        var conn: Connection = null
        var ps: PreparedStatement = null
        try {
          conn = DriverManager.getConnection(jdbcUrl, username, password)
          ps = conn.prepareStatement("insert into pvflow(pv,uv,traffic,client_type) values (?,?,?,?)")
          partitionOfRecords.foreach { case (key: String, value2: ((Iterable[Long], Iterable[Long]), Iterable[Long])) =>
            val (trafficSum, pvSum) = getNu(value2._1.toString())
            val uvSum = value2._2.toList(0)
            ps.setLong(1, pvSum.toLong)
            ps.setLong(2, uvSum)
            ps.setLong(3, trafficSum.toLong)
            ps.setInt(4, clientType)
            ps.executeUpdate()
          }
        } catch {
          case e: Exception => println("Mysql Exception" + e)
        } finally {
          if (ps != null) {
            ps.close()
          }
          if (conn != null) {
            conn.close()
          }
        }
      })
    })

  }

  // Extract (traffic, pv) from the string form of the cogrouped CompactBuffers.
  // Fragile (it relies on CompactBuffer.toString), but it matches how the counts are produced above.
  def getNu(str: String): (String, String) = {
    val regex = """CompactBuffer\(\(CompactBuffer\(([0-9]+)\),CompactBuffer\(([0-9]+)\)\)\)""".r
    val regex(traffic, pv) = str
    (traffic, pv)
  }

  def main(args: Array[String]) {
    // The same settings could be read from application.conf instead of the command line:
    //val config = ConfigFactory.load()
    //val task_name = config.getString("task.name")
    //val mysql_hostname = config.getString("mysql.hostname")
    //val mysql_port = config.getString("mysql.port")
    //val mysql_db = config.getString("mysql.db")
    //val mysql_username = config.getString("mysql.username")
    //val mysql_password = config.getString("mysql.password")
    //val client_type = config.getInt("client.type")

    //val ssc = new StreamingContext(master, task_name, Seconds(batch_interval))
    //ssc.checkpoint("hdfs://localhost:8020/checkpoint")

    println("start sparkstreaming............")

    val Array(host, port, batch_interval, mysql_hostname, mysql_port, mysql_db, mysql_username, mysql_password, client_type, task_name) = args
    println(host)
    val jdbcUrl = "jdbc:mysql://" + mysql_hostname + ":" + mysql_port + "/" + mysql_db

    val batchInterval = Milliseconds(batch_interval.toInt)
    val sparkConf = new SparkConf().setAppName(task_name)
    val ssc = new StreamingContext(sparkConf, batchInterval)
    //val source = FlumeUtils.createStream(ssc, ip, port, StorageLevel.MEMORY_ONLY)
    val source = FlumeUtils.createStream(ssc, host, port.toInt, StorageLevel.MEMORY_ONLY_SER_2)

    // Reuse connections through a pool (Redis sink, currently disabled):
    //val pool = {
    //  val pool = createRedisPool(redisIp, redisPort, redisPwd)
    //  ssc.sparkContext.broadcast(pool)
    //}

    // Cleansing: keep only lines that parse, then turn them into AccessInfo records.
    val cleanSource = source.filter(etl).map(parseRawAccessInfo).cache()

    //cleanSource.print()
    /**
     * TODO: how to expire stale dictionary entries
     */
    //val dict = cleanSource.map { userDownload =>
    //  (combine(userDownload.imei, userDownload.appName), userDownload.timestamp)
    //}
    //dict.print()
    // Real-time dictionary: the user's most recent download record.
    //val currentDict = dict.updateStateByKey(updateDict)

    // Count user downloads.
    //val downloadCount = AppDownloadCount(cleanSource, currentDict)
    val traffic = trafficCount(cleanSource)
    val pv = pvCount(cleanSource)
    val uv_list = uvList(cleanSource)
    val uv = uvCount(uv_list)

    //traffic.print()
    //pv.print()
    //uv.print()

    println("\n\n\n==========================sparkstreaming is running............")
    // Write the results to MySQL.
    sinkToMysql(traffic, pv, uv, uv_list, jdbcUrl, mysql_username, mysql_password, client_type.toInt)

    ssc.start()
    ssc.awaitTermination()
  }

}
--------------------------------------------------------------------------------
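The sink above assumes that the uvList and pvflow tables already exist in the MySQL database configured in application.conf. A possible way to create them, inferred only from the INSERT statements (the column types are guesses; the user, password prompt and database name follow application.conf):

    mysql -u spark -p spark -e \
        "CREATE TABLE IF NOT EXISTS uvList (cookie VARCHAR(255), count BIGINT, client_type INT);
         CREATE TABLE IF NOT EXISTS pvflow (pv BIGINT, uv BIGINT, traffic BIGINT, client_type INT);"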
/src/main/scala/org/github/drunk2013/spark/streaming/KafkaDemo.scala:
--------------------------------------------------------------------------------
package org.github.drunk2013.spark.streaming

import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
import org.apache.spark.SparkConf

// Windowed word count over a Kafka topic.
object KafkaDemo {

  def main(args: Array[String]) {
    println("start sparkstreaming............")

    //val Array(zkQuorum, group, topics, numThreads) = args
    val sparkConf = new SparkConf().setAppName("KafkaWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(2))
    // Checkpointing is required because reduceByKeyAndWindow is used with an inverse function.
    ssc.checkpoint("checkpoint")

    val topics = "test"
    //val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
    val topicMap = topics.split(",").map((_, 1)).toMap
    val lines = KafkaUtils.createStream(ssc, "localhost:2181", "spark", topicMap).map(_._2)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKeyAndWindow(_ + _, _ - _, Minutes(10), Seconds(2), 2)
    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }

}
--------------------------------------------------------------------------------
/src/main/scala/org/github/drunk2013/spark/streaming/KafkaNginxAnalyze.scala:
--------------------------------------------------------------------------------
package org.github.drunk2013.spark.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka._
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.storage.StorageLevel

// Read records from Kafka, turn each batch into a DataFrame and aggregate it with Spark SQL.
object KafkaNginxAnalyze {

  def main(args: Array[String]) {
    println("start sparkstreaming............")

    //val Array(zkQuorum, group, topics, numThreads) = args
    val sparkConf = new SparkConf().setAppName("KafkaNginxAnalyze")
    val ssc = new StreamingContext(sparkConf, Seconds(4))
    //ssc.checkpoint("checkpoint")

    val topics = "test"
    //val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
    val topicMap = topics.split(",").map((_, 1)).toMap
    val lines = KafkaUtils.createStream(ssc, "localhost:2181", "spark", topicMap).map(_._2)

    // One record per line.
    val words = lines.flatMap(_.split("\n"))
    //words.print()
    words.foreachRDD { rdd: RDD[String] =>
      val sqlContext = SQLContextSingleton.getInstance(rdd.sparkContext)
      import sqlContext.implicits._
      // Turn the comma-separated records into a DataFrame.
      val studentDataFrame = rdd.map(_.split(",")).map(row => Student(row(0).toString, row(1).toString, row(2).toInt)).toDF()
      studentDataFrame.registerTempTable("student")
      // Aggregate with SQL.
      val result = sqlContext.sql("select id, count(*) as count from student group by id")

      result.show()
    }
    //lines.print()

    ssc.start()
    ssc.awaitTermination()
  }

}

/** Schema of the incoming records. */
case class Student(id: String, name: String, age: Int)

/** Lazily instantiated singleton instance of SQLContext */
object SQLContextSingleton {

  @transient private var instance: SQLContext = _

  def getInstance(sparkContext: SparkContext): SQLContext = {
    if (instance == null) {
      instance = new SQLContext(sparkContext)
    }
    instance
  }
}
--------------------------------------------------------------------------------
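KafkaNginxAnalyze expects comma-separated id,name,age records on the "test" topic (the Student schema above), and KafkaDemo runs a windowed word count over the same topic. A quick way to push a few test records with the Kafka 0.8 console producer; the broker address and the location of the Kafka scripts are assumptions:

    printf '1,alice,20\n2,bob,21\n1,alice,22\n' | \
        kafka-console-producer.sh --broker-list localhost:9092 --topic test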
/src/main/scala/org/github/drunk2013/spark/streaming/model/AccessInfo.scala:
--------------------------------------------------------------------------------
package org.github.drunk2013.spark.streaming.model

// Case class intended for pattern matching; currently unused, FlumeNginxAnalyze defines its own AccessInfo.
//case class AccessInfo{
//  var ip: String
//  var time: String
//  var http_method: String
//  var access_url: String
//  var http_version: String
//  var http_status: String
//  var traffic: Long
//  var referrers: String
//  var agents: String
//  var cookie: String
//}

//class AccessInfo(ip: String, time: String, http_method: String, access_url: String, http_version: String, http_status: String, traffic: Long, referrers: String, agents: String, cookie: String)
--------------------------------------------------------------------------------
/src/main/scala/org/github/drunk2013/spark/streaming/util/Util.scala:
--------------------------------------------------------------------------------
package org.github.drunk2013.spark.streaming.util
--------------------------------------------------------------------------------
/update.sh:
--------------------------------------------------------------------------------
#!/bin/bash
sbt package
rm -f /usr/lib/spark/lib/spark-streaming_2.10-1.0.jar
cp target/scala-2.10/spark-streaming_2.10-1.0.jar /usr/lib/spark/lib/
--------------------------------------------------------------------------------