├── src
│   ├── main
│   │   ├── scala
│   │   │   ├── common
│   │   │   │   ├── mycallback.scala
│   │   │   │   ├── kryoSerializer.scala
│   │   │   │   ├── eventRow.scala
│   │   │   │   ├── KafkaSink.scala
│   │   │   │   ├── Args.scala
│   │   │   │   └── igniteWriter.scala
│   │   │   ├── objectProject
│   │   │   │   ├── structuredStreamingKafkaToIgnitePerformance.scala
│   │   │   │   ├── streamingKafkaToIgnitePerformance.scala
│   │   │   │   └── dataImportKafkaPerformance.scala
│   │   │   └── textProject
│   │   │       ├── structuredStreamingKafkaToIgnitePerformance.scala
│   │   │       ├── streamingKafkaToIgnitePerformance.scala
│   │   │       └── dataImportKafkaPerformance.scala
│   │   └── java
│   │       └── mycallback.java
│   └── assembly
│       └── bin.xml
├── bin
│   ├── dataImportKafkaPerformance.sh
│   ├── sparkstreamingkafkaperformance.sh
│   ├── structuredStreamingkafkaperformance.sh
│   └── pef.sh
├── smokeData
│   └── imputData.md
├── README.md
├── PerformanceTestResult.md
├── KafkaTuning.md
├── configFile
│   └── ignite-template.xml
├── FunctionTestResult.md
└── pom.xml
/src/main/scala/common/mycallback.scala:
--------------------------------------------------------------------------------
1 | package common
2 |
3 | import org.apache.kafka.clients.producer.{Callback, RecordMetadata}
4 |
5 | class mycallback extends Callback {
6 |   // Log the failure if one occurred; the original stub (???) threw NotImplementedError on every completion.
7 |   override def onCompletion(metadata: RecordMetadata, exception: Exception): Unit =
8 |     if (exception != null) exception.printStackTrace()
9 | }
--------------------------------------------------------------------------------
/src/main/java/mycallback.java:
--------------------------------------------------------------------------------
1 | import org.apache.kafka.clients.producer.Callback;
2 | import org.apache.kafka.clients.producer.RecordMetadata;
3 |
4 | public class mycallback implements Callback {
5 | @Override
6 | public void onCompletion(RecordMetadata metadata, Exception exception) {
7 |         // No-op: this placeholder callback ignores both the metadata and any send exception.
8 | }
9 | }
10 |
--------------------------------------------------------------------------------
/bin/dataImportKafkaPerformance.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | spark2-submit \
3 | --class dataImportKafkaPerformance \
4 | --master yarn \
5 | --deploy-mode client \
6 | ./sparkstreamingkafkaperformance-1.0-SNAPSHOT.jar \
7 | -cachename yc \
8 | -igniteconfxml /opt/ignite/ignite-config-client.xml \
9 | -brokers datanode1:9092 \
10 | -partitionNum 44 \
11 | -groupid yc \
12 | -hiveTableName default.mm \
13 | -topic yc \
14 | -appName kafkainput
--------------------------------------------------------------------------------
/bin/sparkstreamingkafkaperformance.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | spark2-submit \
3 | --class streamingKafkaToIgnitePerformance \
4 | --master yarn \
5 | --deploy-mode client \
6 | ./sparkstreamingkafkaperformance-1.0-SNAPSHOT.jar \
7 | -cachename yc \
8 | -igniteconfxml /opt/ignite/config/default-config.xml \
9 | -brokers datanode1:9092 \
10 | -partitionNum 44 \
11 | -groupid yc \
12 | -hiveTableName default.mm \
13 | -topic yc \
14 | -appName streamingToIgnite
15 |
--------------------------------------------------------------------------------
/bin/structuredStreamingkafkaperformance.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | spark2-submit \
3 | --class structuredStreamingKafkaToIgnitePerformance \
4 | --master yarn \
5 | --deploy-mode client \
6 | ./sparkstreamingkafkaperformance-1.0-SNAPSHOT.jar \
7 | -cachename yc \
8 | -igniteconfxml /opt/ignite/config/default-config.xml \
9 | -brokers datanode1:9092 \
10 | -partitionNum 44 \
11 | -groupid yc \
12 | -hiveTableName default.mm \
13 | -topic yc \
14 | -appName streamingToIgnite
15 |
--------------------------------------------------------------------------------
/smokeData/imputData.md:
--------------------------------------------------------------------------------
1 | 20180201,115655,200000010000001,1,7983,1,10000.00 ,S,62259910005001,11000001,其他代码,11/10/2018,01
2 | 20180201,115656,200000010000002,1,7983,1,10000.00 ,S,62259910005002,11000002,其他代码,11/11/2018,02
3 | 20180201,115657,200000010000003,1,7983,1,10000.00 ,S,62259910005003,11000003,其他代码,11/12/2018,03
4 | 20180201,115658,200000010000004,1,7983,1,10000.00 ,S,62259910005004,11000001,其他代码,11/13/2018,04
5 | 20180201,115659,200000010000005,1,7983,1,10000.00 ,S,62259910005005,11000002,其他代码,11/14/2018,05
6 | 20180201,115660,200000010000006,1,7983,1,10000.00 ,S,62259910005006,11000003,其他代码,11/15/2018,06
7 | 20180201,115661,200000010000007,1,7983,1,10000.00 ,S,62259910005007,11000001,其他代码,11/16/2018,07
8 | 20180201,115662,200000010000008,1,7983,1,10000.00 ,S,62259910005008,11000002,其他代码,11/17/2018,08
9 | 20180201,115663,200000010000009,1,7983,1,10000.00 ,S,62259910005009,11000003,其他代码,11/18/2018,09
10 |
11 |
12 | Note: the Chinese fields are GBK-encoded. Specify the encoding when creating the Hive table so that Spark reads the data without garbled characters.
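
If the raw CSV were read directly with Spark instead of through a Hive table created with `'serialization.encoding'='GBK'` (as in FunctionTestResult.md), the encoding could be set on the CSV reader instead; a minimal sketch with a placeholder input path:

```scala
import org.apache.spark.sql.SparkSession

object GbkReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("gbkReadSketch").getOrCreate()
    // The CSV source decodes the file as GBK, so the Chinese column is not garbled.
    val df = spark.read
      .option("encoding", "GBK")
      .csv("/tmp/smokeData/imputData.csv") // placeholder path
    df.show(false)
    spark.stop()
  }
}
```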
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Code Structure
2 |
3 | 1. textProject: each record written to Kafka is a line of text and each record written to Ignite is a row of text; in this scenario Ignite is used as a distributed in-memory database
4 |
5 | 2. objectProject: each record written to Kafka is an object and each record written to Ignite is an object; in this scenario Ignite is used as a distributed key-value store, which is the approach currently used in production (see the sketch below)
6 |
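A minimal sketch of the difference between the two paths, reusing the helpers under `src/main/scala/common` (the wrapper object and method are hypothetical; the topic and key layout follow the defaults used elsewhere in this project):

```scala
import common.{KafkaSink, eventRow, kryoSerializer}

object WritePathSketch {
  // `sink` stands for the broadcast KafkaSink and `row` for one parsed record.
  def writeBoth(sink: KafkaSink[String, Object], row: eventRow, partition: Int): Unit = {
    // textProject path: the Kafka value is the whole row rendered as one comma-separated line of text.
    sink.send("yc", partition, row.kehhao, row.toString)

    // objectProject path: the Kafka value is the eventRow serialized to bytes with Kryo and keyed by
    // kehhao_guiyls_jioysj, so Ignite can serve as a key-value store for the objects.
    sink.send("yc", partition, row.kehhao + "_" + row.guiyls + "_" + row.jioysj,
      kryoSerializer.setSerializationObjectByKryo(row))
  }
}
```
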
7 | # Requirements
8 |
9 | 1. Produce data into Kafka at high throughput and summarize producer optimizations
10 |
11 | 2. Summarize Kafka broker optimizations, covering storage, replication threads, etc.
12 |
13 | 3. Consume Kafka data with multi-threaded, high-throughput Spark Streaming; summarize consumer and Spark Streaming optimizations
14 |
15 | 4. Consume Kafka data with multi-threaded, high-throughput Structured Streaming and compare its performance with the above (not yet tested)
16 |
17 | # Dataset
18 |
19 | Bank transaction data, 300 GB
20 |
21 | # Performance Test Environment
22 |
23 | 6 compute nodes, 200 cores, 800 GB of memory
24 |
25 | # Kafka Parameter Tuning
26 |
27 | [KafkaTuning.md](./KafkaTuning.md)
28 |
29 | # Spark Parameter Tuning
30 |
31 | [Spark parameter tuning](./bin/pef.sh)
32 |
33 | # Function Test Results
34 |
35 | [FunctionTestResult](./FunctionTestResult.md)
36 |
37 | # Performance Test Results
38 |
39 | [PerformanceTestResult](./PerformanceTestResult.md)
40 |
41 |
42 | Issue: the average write TPS only reaches about 10,000 and writes slow down as more data accumulates. Preliminary analysis points at the three-column composite primary key: with a single-column primary key and transactions disabled, TPS reaches about 60,000 (see the sketch below)
43 |
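The single-primary-key, non-transactional variant can be expressed with the Ignite DataFrame options already used in `streamingKafkaToIgnitePerformance`; a minimal sketch, where the choice of `guiyls` as the single key and the `ATOMICITY=ATOMIC` table parameters are illustrative assumptions rather than the tested configuration:

```scala
import org.apache.ignite.spark.IgniteDataFrameSettings._
import org.apache.spark.sql.DataFrame

object SingleKeyWriteSketch {
  // Compare with the defaults in common/Args.scala: primaryKey = "guiyls,kehhao,jioysj"
  // and tableParameters containing ATOMICITY=TRANSACTIONAL.
  def write(df: DataFrame, igniteConfXml: String): Unit =
    df.write
      .format(FORMAT_IGNITE)
      .option(OPTION_CONFIG_FILE, igniteConfXml)
      .option(OPTION_TABLE, "yc")
      .option(OPTION_CREATE_TABLE_PRIMARY_KEY_FIELDS, "guiyls")                             // single key column
      .option(OPTION_CREATE_TABLE_PARAMETERS, "BACKUPS=1, ATOMICITY=ATOMIC, CACHE_NAME=yc") // no transactions
      .mode("Append")
      .save()
}
```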
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
--------------------------------------------------------------------------------
/src/main/scala/common/kryoSerializer.scala:
--------------------------------------------------------------------------------
1 | package common
2 |
3 | import java.io.ByteArrayOutputStream
4 |
5 | import com.esotericsoftware.kryo.Kryo
6 | import com.esotericsoftware.kryo.io.{Input, Output}
7 |
8 | object kryoSerializer {
9 |
10 |   /** Serialize an object to a byte array with Kryo. */
11 |   def setSerializationObjectByKryo(ob: Object): Array[Byte] = {
12 |     val by = new ByteArrayOutputStream()
13 |     val output = new Output(by)
14 |     try {
15 |       val kryo = new Kryo()
16 |       kryo.writeObject(output, ob)
17 |       output.close()
18 |     } catch {
19 |       case ex: Throwable =>
20 |         ex.printStackTrace()
21 |     }
22 |
23 |     by.toByteArray
24 |
25 |   }
26 |
27 |   /** Deserialize a byte array back into an eventRow; returns null if deserialization fails. */
28 |   def getSerializationObjectByKryo(bytes: Array[Byte]): eventRow = {
29 |     val input = new Input(bytes)
30 |     var event: eventRow = null
31 |
32 |     try {
33 |       val kryo = new Kryo()
34 |       event = kryo.readObject(input, classOf[eventRow])
35 |       input.close()
36 |     } catch {
37 |       case ex: Throwable =>
38 |         ex.printStackTrace()
39 |     }
40 |
41 |     event
42 |
43 |   }
45 |
46 | }
47 |
--------------------------------------------------------------------------------
/src/assembly/bin.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | tar.gz
4 |
5 |
12 |
13 |
14 | smokeData
15 | smokeData
16 |
17 |
18 | configFile
19 | configFile
20 |
21 |
22 | bin
23 | bin
24 | 0755
25 | 0755
26 |
27 | *.sh
28 |
29 | unix
30 |
31 |
32 |
33 | target
34 | lib
35 | 0644
36 |
37 | ${project.name}-${project.version}.jar
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/src/main/scala/common/eventRow.scala:
--------------------------------------------------------------------------------
1 | package common
2 |
3 | import java.io.ByteArrayOutputStream
4 |
5 | import com.esotericsoftware.kryo.Kryo
6 | import com.esotericsoftware.kryo.io.{Input, Output}
7 |
8 | case class eventRow(
9 | jioyrq: String,
10 | jioysj: String,
11 | guiyls: String,
12 | cpznxh: String,
13 | jiaoym: String,
14 | jiedbz: String,
15 | jio1je: String,
16 | kemucc: String,
17 | kehuzh: String,
18 | kehhao: String,
19 | zhyodm: String,
20 | hmjsjc: String,
21 | huobdh: String
22 | ) {
23 | def setSerializationObjectByKryo(ob: Object): Array[Byte] = {
24 |
25 | var by = new ByteArrayOutputStream()
26 | var output = new Output(by)
27 | try {
28 | val kryo = new Kryo()
29 | kryo.writeObject(output, ob)
30 | output.close()
31 | }catch {
32 | case ex:Any => {
33 | ex.printStackTrace()
34 | }
35 | }
36 | by.toByteArray
37 |
38 | }
39 |
40 | def getSerializationObjectByKryo(bytes: Array[Byte]) = {
41 |
42 | var input = new Input(bytes)
43 | var event: eventRow = null
44 |
45 | try {
46 | val kryo = new Kryo()
47 | event = kryo.readObject(input, classOf[eventRow])
48 | input.close()
49 | }catch {
50 | case ex:Any => {
51 | ex.printStackTrace()
52 | }
53 | }
54 |
55 | event
56 |
57 | }
58 | }
59 |
60 |
61 |
--------------------------------------------------------------------------------
/src/main/scala/objectProject/structuredStreamingKafkaToIgnitePerformance.scala:
--------------------------------------------------------------------------------
1 | package objectProject
2 |
3 |
4 | import com.beust.jcommander.JCommander
5 | import org.apache.log4j.Logger
6 | import org.apache.spark.sql.SparkSession
7 | import common.{Args, igniteWriter}
8 |
9 | class structuredStreamingKafkaToIgnitePerformance {
10 |
11 | }
12 |
13 | object structuredStreamingKafkaToIgnitePerformance {
14 |
15 |   private val log = Logger.getLogger(classOf[structuredStreamingKafkaToIgnitePerformance])
16 |
17 | def main(args: Array[String]): Unit = {
18 |
19 |     /**
20 |      * Parse the input arguments and define global variables
21 |      */
22 |
23 |     log.info("Parsing input arguments")
24 | val argv = new Args()
25 | JCommander.newBuilder().addObject(argv).build().parse(args: _*)
26 |
27 |     /**
28 |      * Create the source/dest contexts
29 |      */
30 |     log.info("Initializing the SparkSession")
31 | val spark = SparkSession.builder().appName(argv.appName).enableHiveSupport().getOrCreate()
32 | spark.sparkContext.getConf.registerKryoClasses(Array(classOf[Args]))
33 |
34 | val kafkaParams = Map[String, String](
35 | "subscribe" -> argv.topic,
36 | "kafka.bootstrap.servers" -> argv.brokers,
37 | "group.id" -> argv.groupid,
38 | "auto.offset.reset" -> "latest",
39 | "session.timeout.ms" -> "30000"
40 | )
41 |
42 | val records = spark.readStream.format("kafka").options(kafkaParams)
43 | .option("enable.auto.commit", (false: java.lang.Boolean))
44 | .option("checkpointLocation", "/tmp/structuredStreaming")
45 | .load()
46 |
47 |     /**
48 |      * Start processing data
49 |      */
50 |
51 |     val recordsValues = records.selectExpr("CAST(value AS STRING)")
52 |
53 |     val igniteJdbc = "jdbc:ignite:cfg://file://" + argv.igniteconfxml
54 |     recordsValues.writeStream.foreach(new igniteWriter(igniteJdbc)).outputMode("append").start().awaitTermination()
55 |
56 | }
57 |
58 | }
59 |
60 |
--------------------------------------------------------------------------------
/src/main/scala/textProject/structuredStreamingKafkaToIgnitePerformance.scala:
--------------------------------------------------------------------------------
1 | package textProject
2 |
3 |
4 | import com.beust.jcommander.JCommander
5 | import org.apache.log4j.Logger
6 | import org.apache.spark.sql.SparkSession
7 | import common.{Args, igniteWriter}
8 |
9 | class structuredStreamingKafkaToIgnitePerformance {
10 |
11 | }
12 |
13 | object structuredStreamingKafkaToIgnitePerformance {
14 |
15 | private val log = Logger.getLogger(classOf[structuredStreamingKafkaToIgnitePerformance])
16 |
17 | def main(args: Array[String]): Unit = {
18 |
19 |     /**
20 |      * Parse the input arguments and define global variables
21 |      */
22 |
23 |     log.info("Parsing input arguments")
24 | val argv = new Args()
25 | JCommander.newBuilder().addObject(argv).build().parse(args: _*)
26 |
27 |     /**
28 |      * Create the source/dest contexts
29 |      */
30 |     log.info("Initializing the SparkSession")
31 | val spark = SparkSession.builder().appName(argv.appName).enableHiveSupport().getOrCreate()
32 | spark.sparkContext.getConf.registerKryoClasses(Array(classOf[Args]))
33 |
34 | val kafkaParams = Map[String, String](
35 | "subscribe" -> argv.topic,
36 | "kafka.bootstrap.servers" -> argv.brokers,
37 | "group.id" -> argv.groupid,
38 | "auto.offset.reset" -> "latest",
39 | "session.timeout.ms" -> "30000"
40 | )
41 |
42 | val records = spark.readStream.format("kafka").options(kafkaParams)
43 | .option("enable.auto.commit", (false: java.lang.Boolean))
44 | .option("checkpointLocation", "/tmp/structuredStreaming")
45 | .load()
46 |
47 |     /**
48 |      * Start processing data
49 |      */
50 |
51 |     val recordsValues = records.selectExpr("CAST(value AS STRING)")
52 |
53 |     val igniteJdbc = "jdbc:ignite:cfg://file://" + argv.igniteconfxml
54 |     recordsValues.writeStream.foreach(new igniteWriter(igniteJdbc)).outputMode("append").start().awaitTermination()
55 |
56 | }
57 |
58 | }
59 |
60 |
--------------------------------------------------------------------------------
/src/main/scala/common/KafkaSink.scala:
--------------------------------------------------------------------------------
1 | package common
2 |
3 |
4 | import java.util.concurrent.Future
5 |
6 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata}
7 | import org.apache.kafka.common.errors.InterruptException
8 |
9 | class KafkaSink[K, V](createProducer: () => KafkaProducer[K, V]) extends Serializable {
10 | /* This is the key idea that allows us to work around running into
11 | NotSerializableExceptions. */
12 | lazy val producer = createProducer()
13 |
14 | def send(topic: String, key: K, value: V): Future[RecordMetadata] =
15 | producer.send(new ProducerRecord[K, V](topic, key, value))
16 |
17 | def send(topic: String, value: V): Future[RecordMetadata] =
18 | producer.send(new ProducerRecord[K, V](topic, value))
19 |
20 | def send(topic: String, partitionNum: Integer, key: K, value: V) =
21 | producer.send(new ProducerRecord[K,V](topic, partitionNum, key, value))
22 |
23 | def send(topic: String, partitionNum: Integer, key: K, value: V, callback: mycallback) =
24 | try {
25 | producer.send(new ProducerRecord[K, V](topic, partitionNum, key, value), new mycallback).get()
26 | } catch {
27 |       case ex: InterruptException => println("Kafka send interrupted: " + ex.getMessage)
28 | }
29 | }
30 |
31 | object KafkaSink {
32 |
33 | import scala.collection.JavaConversions._
34 |
35 | def apply[K, V](config: Map[String, Object]): KafkaSink[String, Object] = {
36 | val createProducerFunc = () => {
37 | val producer = new KafkaProducer[String, Object](config)
38 | sys.addShutdownHook {
39 | // Ensure that, on executor JVM shutdown, the Kafka producer sends
40 | // any buffered messages to Kafka before shutting down.
41 | producer.close()
42 | }
43 | producer
44 | }
45 | new KafkaSink(createProducerFunc)
46 | }
47 |
48 | def apply[K, V](config: java.util.Properties): KafkaSink[String, Object] = apply(config.toMap)
49 | }
50 |
51 |
52 |
53 |
54 |
--------------------------------------------------------------------------------
/bin/pef.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ## 1. Baseline run with default parameters
4 |
5 | spark2-submit \
6 | --executor-memory 8G --executor-cores 4 --num-executors 10 \
7 | --class dataImportKafkaPerformance \
8 | --master yarn \
9 | --deploy-mode client \
10 | ./sparkstreamingkafkaperformance-1.0-SNAPSHOT.jar \
11 | -cachename yc \
12 | -igniteconfxml /opt/ignite/ignite-config-client.xml \
13 | -brokers datanode1:9092 \
14 | -partitionNum 3 \
15 | -groupid yc \
16 | -hiveTableName default.mm \
17 | -topic yc \
18 | -appName kafkainput
19 |
20 |
21 | ## 2. Increase parallelism and tune the executors
22 |
23 | spark2-submit \
24 | --executor-memory 8G --executor-cores 4 --num-executors 10 \
25 | --conf spark.default.parallelism=1000 \
26 | --conf spark.storage.memoryFraction=0.5 \
27 | --conf spark.shuffle.memoryFraction=0.3 \
28 | --class dataImportKafkaPerformance \
29 | --master yarn \
30 | --deploy-mode client \
31 | ./sparkstreamingkafkaperformance-1.0-SNAPSHOT.jar \
32 | -cachename yc \
33 | -igniteconfxml /opt/ignite/ignite-config-client.xml \
34 | -brokers datanode1:9092 \
35 | -partitionNum 3 \
36 | -groupid yc \
37 | -hiveTableName default.mm \
38 | -topic yc \
39 | -appName kafkainput
40 |
41 | ## 3. Add GC tuning
42 |
43 | spark2-submit \
44 | --executor-memory 8G --executor-cores 4 --num-executors 10 \
45 | --conf spark.default.parallelism=1000 \
46 | --conf spark.storage.memoryFraction=0.4 \
47 | --conf spark.shuffle.memoryFraction=0.2 \
48 | --conf spark.executor.extraJavaOptions="-XX:MaxGCPauseMillis=100 -XX:ParallelGCThreads=8 -XX:ConcGCThreads=2 -XX:+UseG1GC" \
49 | --class dataImportKafkaPerformance \
50 | --master yarn \
51 | --deploy-mode client \
52 | ./sparkstreamingkafkaperformance-1.0-SNAPSHOT.jar \
53 | -cachename yc \
54 | -igniteconfxml /opt/ignite/ignite-config-client.xml \
55 | -brokers datanode1:9092 \
56 | -partitionNum 3 \
57 | -groupid yc \
58 | -hiveTableName default.mm \
59 | -topic yc \
60 | -appName kafkainput
61 |
--------------------------------------------------------------------------------
/src/main/scala/common/Args.scala:
--------------------------------------------------------------------------------
1 | package common
2 |
3 | import com.beust.jcommander.Parameter
4 |
5 | class Args extends Serializable {
6 |
7 | @Parameter(names = Array("-appName"), required = true) var appName: String = null
8 |
9 | @Parameter(names = Array("-igniteconfxml"), required = true) var igniteconfxml: String = null
10 |
11 | @Parameter(names = Array("-cachename"), required = true) var cachename: String = null
12 |
13 | @Parameter(names = Array("-partitionNum"), required = true) var partitionNum: Integer = null
14 |
15 | @Parameter(names = Array("-brokers"), required = true) var brokers: String = null
16 |
17 | @Parameter(names = Array("-groupid"), required = true) var groupid: String = null
18 |
19 | @Parameter(names = Array("-topic"), required = true) var topic: String = null
20 |
21 | @Parameter(names = Array("-hiveTableName"), required = true) var hiveTableName: String = null
22 |
23 | @Parameter(names = Array("-topicCompression"), required = false) var topicCompression: String = "snappy"
24 |
25 | @Parameter(names = Array("-bufferMem"), required = false) var bufferMem: String = "33554432"
26 |
27 | @Parameter(names = Array("-lingerMs"), required = false) var lingerMs: String = "0"
28 |
29 | @Parameter(names = Array("-retries"), required = false) var retries: String = "0"
30 |
31 | @Parameter(names = Array("-durationTime"), required = false) var durationTime: Int = 500
32 |
33 | @Parameter(names = Array("-perConnection"), required = false) var perConnection: String = "1"
34 |
35 | @Parameter(names = Array("-batchSize"), required = false) var batchSize: String = "65536"
36 |
37 | @Parameter(names = Array("-allowOverwrite"), required = false) var allowOverwrite: Boolean = true
38 |
39 | @Parameter(names = Array("-primaryKey"), required = false) var primaryKey: String = "guiyls,kehhao,jioysj"
40 |
41 | @Parameter(names = Array("-writeMode"), required = false) var writeMode: String = "Append"
42 |
43 | @Parameter(names = Array("-tableParameters"), required = false) var tableParameters: String = "BACKUPS=1, ATOMICITY=TRANSACTIONAL, CACHE_NAME=yc, DATA_REGION=Default_Region"
44 |
45 |
46 | }
47 |
--------------------------------------------------------------------------------
/src/main/scala/common/igniteWriter.scala:
--------------------------------------------------------------------------------
1 | package common
2 |
3 | import java.sql.{Connection, DriverManager, SQLException}
4 |
5 | import org.apache.spark.sql.{ForeachWriter, Row}
7 |
8 | class igniteWriter(igniteJdbc: String) extends ForeachWriter[Row] {
9 |
10 |
11 | //"jdbc:ignite:cfg://file:///etc/config/ignite-jdbc.xml"
12 | //"INSERT INTO Person(_key, name, age) VALUES(CAST(? as BIGINT), ?, ?)"
13 |
14 | var connection: Connection = null
15 | Class.forName("org.apache.ignite.IgniteJdbcDriver")
16 |
17 | override def open(partitionId: Long, version: Long): Boolean = {
18 |
19 | try {
20 | connection = DriverManager.getConnection(igniteJdbc)
21 |     } catch {
22 |       case ex: SQLException => {
23 |         ex.printStackTrace()
24 |         println("Failed to connect to Ignite: " + igniteJdbc)
25 |       }
26 | }
27 |
28 | true
29 | }
30 |
31 | override def process(value: Row): Unit = {
32 | val stmt = connection.prepareStatement("MERGE INTO yc(jioyrq,jioysj,guiyls,cpznxh,jiaoym,jiedbz,jio1je,kemucc,kehuzh,kehhao," +
33 | "zhyodm,hmjsjc,huobdh) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)")
34 |     // The row holds a single string column (the Kafka value); split it into the 13 eventRow fields,
35 |     // stripping the parentheses left over from the textual representation.
36 |     val fields = value.mkString(",").split(",")
37 |     val tmp: eventRow = eventRow(fields(0).replace("(", ""), fields(1), fields(2), fields(3), fields(4), fields(5), fields(6), fields(7), fields(8), fields(9), fields(10), fields(11), fields(12).replace(")", ""))
38 | stmt.setString(1,tmp.jioyrq)
39 | stmt.setString(2,tmp.jioysj)
40 | stmt.setString(3,tmp.guiyls)
41 | stmt.setString(4,tmp.cpznxh)
42 | stmt.setString(5,tmp.jiaoym)
43 | stmt.setString(6,tmp.jiedbz)
44 | stmt.setString(7,tmp.jio1je)
45 | stmt.setString(8,tmp.kemucc)
46 | stmt.setString(9,tmp.kehuzh)
47 | stmt.setString(10,tmp.kehhao)
48 | stmt.setString(11,tmp.zhyodm)
49 | stmt.setString(12,tmp.hmjsjc)
50 | stmt.setString(13,tmp.huobdh)
51 | stmt.execute()
52 | }
53 |
54 | override def close(errorOrNull: Throwable): Unit = connection.close()
55 | }
56 |
--------------------------------------------------------------------------------
/PerformanceTestResult.md:
--------------------------------------------------------------------------------
1 | # Performance test: importing data into Kafka
2 |
3 | Dataset: 331.3 GB
4 |
5 | Row count:
6 |
7 | ## 1. Baseline run with default parameters
8 |
9 | spark2-submit \
10 | --executor-memory 8G --executor-cores 4 --num-executors 10 \
11 | --class textProject.dataImportKafkaPerformance \
12 | --master yarn \
13 | --deploy-mode client \
14 | ./sparkstreamingkafkaperformance-1.0-SNAPSHOT.jar \
15 | -cachename yc \
16 | -igniteconfxml /opt/ignite/ignite-config-client.xml \
17 | -brokers datanode1:9092 \
18 | -partitionNum 44 \
19 | -groupid yc \
20 | -hiveTableName default.mm \
21 | -topic yc \
22 | -appName kafkainput
23 |
24 | |Category|Value|
25 | |:---|:---|
26 | |Total time|10.7 minutes|
27 |
28 | ## 2. Increase parallelism and tune the executors
29 |
30 | spark2-submit \
31 | --executor-memory 8G --executor-cores 4 --num-executors 10 \
32 | --conf spark.default.parallelism=80 \
33 | --conf spark.storage.memoryFraction=0.5 \
34 | --conf spark.shuffle.memoryFraction=0.3 \
35 | --class textProject.dataImportKafkaPerformance \
36 | --master yarn \
37 | --deploy-mode client \
38 | ./sparkstreamingkafkaperformance-1.0-SNAPSHOT.jar \
39 | -cachename yc \
40 | -igniteconfxml /opt/ignite/ignite-config-client.xml \
41 | -brokers datanode1:9092 \
42 | -partitionNum 44 \
43 | -groupid yc \
44 | -hiveTableName default.mm \
45 | -topic yc \
46 | -appName kafkainput
47 |
48 | |Category|Value|
49 | |:---|:---|
50 | |Total time|10.1 minutes|
51 |
52 |
53 | ## 3. Drop the extra conf parameters and tune the number of executors
54 |
55 | spark2-submit \
56 | --executor-memory 8G --executor-cores 4 --num-executors 30 \
57 | --conf spark.default.parallelism=240 \
58 | --class textProject.dataImportKafkaPerformance \
59 | --master yarn \
60 | --deploy-mode client \
61 | ./sparkstreamingkafkaperformance-1.0-SNAPSHOT.jar \
62 | -cachename yc \
63 | -igniteconfxml /opt/ignite/ignite-config-client.xml \
64 | -brokers datanode1:9092 \
65 | -partitionNum 44 \
66 | -groupid yc \
67 | -hiveTableName default.mm \
68 | -topic yc \
69 | -appName kafkainput
70 |
71 | |Category|Value|
72 | |:---|:---|
73 | |Total time|9.8 minutes|
74 |
75 |
76 | ## 4. Add GC tuning
77 |
78 | spark2-submit \
79 | --executor-memory 8G --executor-cores 4 --num-executors 30 \
80 | --conf spark.default.parallelism=360 \
81 | --conf spark.executor.extraJavaOptions="-XX:MaxGCPauseMillis=100 -XX:ParallelGCThreads=8 -XX:ConcGCThreads=2 -XX:+UseG1GC" \
82 | --class textProject.dataImportKafkaPerformance \
83 | --master yarn \
84 | --deploy-mode client \
85 | ./sparkstreamingkafkaperformance-1.0-SNAPSHOT.jar \
86 | -cachename yc \
87 | -igniteconfxml /opt/ignite/ignite-config-client.xml \
88 | -brokers datanode1:9092 \
89 | -partitionNum 44 \
90 | -groupid yc \
91 | -hiveTableName default.mm \
92 | -topic yc \
93 | -appName kafkainput
94 |
95 | |Category|Value|
96 | |:---|:---|
97 | |Total time|10.2 minutes|
98 |
99 | Parameters must be chosen sensibly; unreasonable values can actually hurt performance
100 |
--------------------------------------------------------------------------------
/src/main/scala/objectProject/streamingKafkaToIgnitePerformance.scala:
--------------------------------------------------------------------------------
1 | package objectProject
2 |
3 | import com.beust.jcommander.JCommander
4 | import org.apache.ignite.spark.IgniteDataFrameSettings._
5 | import org.apache.kafka.common.serialization.{ByteArrayDeserializer, StringDeserializer}
6 | import org.apache.log4j.Logger
7 | import org.apache.spark.sql.SparkSession
8 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
9 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
10 | import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils, OffsetRange}
11 | import org.apache.spark.streaming.{Duration, StreamingContext}
12 | import common.{Args, eventRow, kryoSerializer}
13 |
14 | class streamingKafkaToIgnitePerformance {
15 |
16 | }
17 |
18 | object streamingKafkaToIgnitePerformance {
19 |
20 | private val log = Logger.getLogger(classOf[streamingKafkaToIgnitePerformance])
21 |
22 | def main(args: Array[String]): Unit = {
23 |
24 |     /**
25 |      * Parse the input arguments and define global variables
26 |      */
27 |
28 |     log.info("Parsing input arguments")
29 | val argv = new Args()
30 | JCommander.newBuilder().addObject(argv).build().parse(args: _*)
31 |
32 |     /**
33 |      * Create the source/dest contexts
34 |      */
35 |     log.info("Initializing the SparkSession and StreamingContext")
36 | val spark = SparkSession.builder().appName(argv.appName).enableHiveSupport().getOrCreate()
37 | spark.sparkContext.getConf.registerKryoClasses(Array(classOf[Args],classOf[eventRow]))
38 |
39 | val ssc = new StreamingContext(spark.sparkContext, Duration(argv.durationTime))
40 | ssc.checkpoint("/tmp/streamingToIgnite")
41 |
42 |     /**
43 |      * Initialize the IgniteContext (left commented out for reference)
44 |      */
45 |     /*
46 |     log.info("========================================== Initializing Ignite ==========================================")
47 | val igniteContext = new IgniteContext(spark.sparkContext, argv.igniteconfxml, true)
48 | val fromCache: IgniteRDD[String, String] = igniteContext.fromCache(argv.cachename)
49 | */
50 |
51 |     /**
52 |      * Create the multi-threaded Kafka stream
53 |      */
54 |     log.info("Initializing the Kafka stream")
55 | val kafkaParams = Map[String, Object](
56 | "bootstrap.servers" -> argv.brokers,
57 | "key.deserializer" -> classOf[StringDeserializer],
58 | "value.deserializer" -> classOf[ByteArrayDeserializer],
59 | "group.id" -> argv.groupid,
60 | "auto.offset.reset" -> "latest",
61 | "session.timeout.ms" -> "30000",
62 | "enable.auto.commit" -> (false: java.lang.Boolean)
63 | )
64 | val topics = Array(argv.topic)
65 |
66 | val stream = KafkaUtils.createDirectStream[String, Array[Byte]](ssc, PreferConsistent, Subscribe[String, Array[Byte]](topics, kafkaParams))
67 |
68 |     /**
69 |      * Start processing data
70 |      */
71 |     log.info("Processing data")
72 |
73 | var offsetRanges = Array[OffsetRange]()
74 |
75 | stream.foreachRDD(rdd => {
76 |
77 |
78 | offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
79 |
80 |       /**
81 |        * For testing only: print the offset, key and value of each record
82 |        *
83 |        * Offsets could be persisted in the same way if needed
84 |        */
85 | /*
86 | for (record <- rdd) {
87 | System.out.printf("offset = %d, key = %s, value = %s\n",
88 | record.offset(), record.key(), record.value());
89 | }
90 | */
91 |
92 | val valueRDD = rdd.map(x=>(x.key(),kryoSerializer.getSerializationObjectByKryo(x.value())))
93 |
94 |       log.info("Writing to Ignite")
95 |
96 | import spark.implicits._
97 | val df = valueRDD.toDF()
98 |
99 | df.write
100 | .format(FORMAT_IGNITE)
101 | .option(OPTION_CONFIG_FILE, argv.igniteconfxml)
102 | .option(OPTION_TABLE, argv.cachename)
103 | .mode(argv.writeMode)
104 | .option(OPTION_STREAMER_ALLOW_OVERWRITE, argv.allowOverwrite)
105 | .option(OPTION_CREATE_TABLE_PRIMARY_KEY_FIELDS, argv.primaryKey)
106 | .option(OPTION_CREATE_TABLE_PARAMETERS, argv.tableParameters)
107 | .save()
108 |
109 | stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
110 |
111 | })
112 |
113 |     // TODO Skip empty micro-batches instead of submitting a job, to save scheduling time
114 | ssc.start()
115 | ssc.awaitTermination()
116 |
117 | }
118 |
119 | }
120 |
--------------------------------------------------------------------------------
/src/main/scala/textProject/streamingKafkaToIgnitePerformance.scala:
--------------------------------------------------------------------------------
1 | package textProject
2 |
3 | import com.beust.jcommander.JCommander
4 | import org.apache.ignite.spark.IgniteDataFrameSettings._
5 | import org.apache.kafka.common.serialization.StringDeserializer
6 | import org.apache.log4j.Logger
7 | import org.apache.spark.sql.SparkSession
8 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
9 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
10 | import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils, OffsetRange}
11 | import org.apache.spark.streaming.{Duration, StreamingContext}
12 | import common.{Args, eventRow}
13 |
14 | class streamingKafkaToIgnitePerformance {
15 |
16 | }
17 |
18 | object streamingKafkaToIgnitePerformance {
19 |
20 | private val log = Logger.getLogger(classOf[streamingKafkaToIgnitePerformance])
21 |
22 | def main(args: Array[String]): Unit = {
23 |
24 |     /**
25 |      * Parse the input arguments and define global variables
26 |      */
27 |
28 |     log.info("Parsing input arguments")
29 | val argv = new Args()
30 | JCommander.newBuilder().addObject(argv).build().parse(args: _*)
31 |
32 |     /**
33 |      * Create the source/dest contexts
34 |      */
35 |     log.info("Initializing the SparkSession and StreamingContext")
36 | val spark = SparkSession.builder().appName(argv.appName).enableHiveSupport().getOrCreate()
37 | spark.sparkContext.getConf.registerKryoClasses(Array(classOf[Args],classOf[eventRow]))
38 |
39 | val ssc = new StreamingContext(spark.sparkContext, Duration(argv.durationTime))
40 | ssc.checkpoint("/tmp/streamingToIgnite")
41 |
42 |     /**
43 |      * Initialize the IgniteContext (left commented out for reference)
44 |      */
45 |     /*
46 |     log.info("========================================== Initializing Ignite ==========================================")
47 | val igniteContext = new IgniteContext(spark.sparkContext, argv.igniteconfxml, true)
48 | val fromCache: IgniteRDD[String, String] = igniteContext.fromCache(argv.cachename)
49 | */
50 |
51 |     /**
52 |      * Create the multi-threaded Kafka stream
53 |      */
54 |     log.info("Initializing the Kafka stream")
55 | val kafkaParams = Map[String, Object](
56 | "bootstrap.servers" -> argv.brokers,
57 | "key.deserializer" -> classOf[StringDeserializer],
58 | "value.deserializer" -> classOf[StringDeserializer],
59 | "group.id" -> argv.groupid,
60 | "auto.offset.reset" -> "latest",
61 | "session.timeout.ms" -> "30000",
62 | "enable.auto.commit" -> (false: java.lang.Boolean)
63 | )
64 | val topics = Array(argv.topic)
65 |
66 | val stream = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))
67 |
68 |     /**
69 |      * Start processing data
70 |      */
71 |     log.info("Processing data")
72 |
73 | var offsetRanges = Array[OffsetRange]()
74 |
75 | stream.foreachRDD(rdd => {
76 |
77 |
78 | offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
79 |
80 |       /**
81 |        * For testing only: print the offset, key and value of each record
82 |        *
83 |        * Offsets could be persisted in the same way if needed
84 |        */
85 | /*
86 | for (record <- rdd) {
87 | System.out.printf("offset = %d, key = %s, value = %s\n",
88 | record.offset(), record.key(), record.value());
89 | }
90 | */
91 |
92 | val valueRDD = rdd.map(_.value().split(","))
93 |
94 |       log.info("Writing to Ignite")
95 |
96 | import spark.implicits._
97 | val df = valueRDD.map(x => eventRow(x(0).replace("(", ""), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9),
98 | x(10), x(11), x(12).replace(")", ""))).toDF()
99 |
100 | df.write
101 | .format(FORMAT_IGNITE)
102 | .option(OPTION_CONFIG_FILE, argv.igniteconfxml)
103 | .option(OPTION_TABLE, argv.cachename)
104 | .mode(argv.writeMode)
105 | .option(OPTION_STREAMER_ALLOW_OVERWRITE, argv.allowOverwrite)
106 | .option(OPTION_CREATE_TABLE_PRIMARY_KEY_FIELDS, argv.primaryKey)
107 | .option(OPTION_CREATE_TABLE_PARAMETERS, argv.tableParameters)
108 | .save()
109 |
110 | stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
111 |
112 | })
113 |
114 |     // TODO Skip empty micro-batches instead of submitting a job, to save scheduling time
115 | ssc.start()
116 | ssc.awaitTermination()
117 |
118 | }
119 |
120 | }
121 |
--------------------------------------------------------------------------------
/KafkaTuning.md:
--------------------------------------------------------------------------------
1 | # producer tuning
2 |
3 | The most important configurations to take care of on the producer side are:
4 |
5 | **1. Compression**
6 |
7 | **2. Batch size**
8 |
9 | **3. Sync or Async**
10 |
11 | The key parameters are:
12 |
13 | batch.size: size-based batching
14 | linger.ms: time-based batching
15 | compression.type: lz4 and snappy are comparable in compression speed
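
These are the same knobs this project exposes through `common/Args.scala` (`-batchSize`, `-lingerMs`, `-topicCompression`) and applies in `dataImportKafkaPerformance`; a minimal producer-side sketch with illustrative values:

```scala
import java.util.Properties
import org.apache.kafka.clients.producer.KafkaProducer
import org.apache.kafka.common.serialization.StringSerializer

object ProducerTuningSketch {
  def buildProducer(brokers: String): KafkaProducer[String, String] = {
    val p = new Properties()
    p.setProperty("bootstrap.servers", brokers)
    p.setProperty("key.serializer", classOf[StringSerializer].getName)
    p.setProperty("value.serializer", classOf[StringSerializer].getName)
    p.setProperty("batch.size", "65536")        // size-based batching
    p.setProperty("linger.ms", "5")             // time-based batching: wait up to 5 ms to fill a batch
    p.setProperty("compression.type", "snappy") // lz4 is comparable in speed
    p.setProperty("acks", "all")                // sync-like durable sends; "1" or "0" trade safety for speed
    new KafkaProducer[String, String](p)
  }
}
```
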
--------------------------------------------------------------------------------
/src/main/scala/textProject/dataImportKafkaPerformance.scala:
--------------------------------------------------------------------------------
97 | (newRow(0).toString, if ((!(newRow(1).toString).equals(""))) newRow(1).toString else "0",
98 | newRow(2).toString, if (!((newRow(3).toString).equals(""))) newRow(3).toString else "0", newRow(4).toString,
99 | newRow(5).toString, if (!((newRow(6).toString).equals(""))) newRow(6).toString else "0", newRow(7).toString, newRow(8).toString,
100 | newRow(9).toString, newRow(10).toString, newRow(11).toString, newRow(12).toString)
101 | )
102 |
103 |     /**
104 |      * Secondary sort
105 |      */
106 |     log.info("========================================== Starting secondary sort ==========================================")
107 |
108 | import org.apache.spark.sql._
109 | val sortFilterTableDF = filterTableDF.repartition(argv.partitionNum,new Column("_10")).sortWithinPartitions("_12")
110 |
111 |     /**
112 |      * Then write each row to its partition with foreachPartition; is a custom partitioner needed here?
113 |      */
114 |
115 |     log.info("========================================== Writing to Kafka ==========================================")
116 | /*
117 | sortFilterTableDF.rdd.mapPartitions(rows => {
118 | log.info("========================================== kafka 1 ==========================================")
119 | rows.map(row => {
120 | val kafkaPartition: Int = row.kehhao.toInt % argv.partitionNum
121 | log.info("kafkaPartition===============" + kafkaPartition)
122 | kafkaProducer.value.send(argv.topic, kafkaPartition ,row.kehhao.toString, row.toString)
123 | })
124 | }).collect()
125 | */
126 |
127 | sortFilterTableDF.foreachPartition(rows=>{
128 | while (rows.hasNext){
129 | val tmp = rows.next()
130 | var kafkaPartition = 0
131 | try {
132 | kafkaPartition = tmp._10.trim.toInt % argv.partitionNum
133 | }catch{
134 | case ex: NumberFormatException =>{
135 | println(ex.getMessage)
136 |           log.warn("Bad record: " + tmp.toString())
137 | }
138 | case ex: Any => {
139 |           println("Unknown error!")
140 | }
141 | }
142 | //log.info("kafkaPartition===============" + kafkaPartition)
143 | kafkaProducer.value.send(argv.topic, kafkaPartition ,tmp._10.toString, tmp.toString())
144 | }
145 | })
146 |
147 | kafkaProducer.value.producer.flush()
148 | kafkaProducer.value.producer.close()
149 |
150 | spark.close()
151 | }
152 |
153 | }
154 |
--------------------------------------------------------------------------------
/src/main/scala/objectProject/dataImportKafkaPerformance.scala:
--------------------------------------------------------------------------------
1 | package objectProject
2 |
3 | import java.util.Properties
4 |
5 | import com.beust.jcommander.JCommander
6 | import org.apache.kafka.common.serialization.{ByteArraySerializer, StringSerializer}
7 | import org.apache.log4j.Logger
8 | import org.apache.spark.broadcast.Broadcast
9 | import org.apache.spark.sql.SparkSession
10 | import common.{Args, KafkaSink, eventRow, kryoSerializer}
11 |
12 | class dataImportKafkaPerformance() {
13 |
14 | }
15 |
16 | object dataImportKafkaPerformance {
17 |
18 | private val log = Logger.getLogger(classOf[dataImportKafkaPerformance])
19 | val sTime: Long = System.currentTimeMillis
20 |
21 | def main(args: Array[String]): Unit = {
22 |
23 | dataImportKafka(args)
24 |
25 | }
26 |
27 | def dataImportKafka(args: Array[String]): Unit = {
28 |     // Parse the input arguments
29 |     log.info("========================================== Initializing JCommander ==========================================")
30 | val argv = new Args()
31 | JCommander.newBuilder().addObject(argv).build().parse(args: _*)
32 |
33 |     // Create the SparkSession
34 | val spark = SparkSession
35 | .builder()
36 | .appName(argv.appName)
37 | .enableHiveSupport()
38 | .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
39 | .getOrCreate()
40 |
41 | spark.sparkContext.getConf.registerKryoClasses(Array(classOf[Args],classOf[eventRow]))
42 |
43 | import spark.implicits._
44 |
45 | /**
46 | * only used for test
47 | */
48 |     //log.warn("Print all configuration options for tuning reference: \n" + spark.conf.getAll)
49 |
50 |     log.info("========================================== Initializing the Kafka producer ==========================================")
51 | val kafkaProducer: Broadcast[KafkaSink[String, Object]] = {
52 | val kafkaProducerConfig = {
53 | val p = new Properties()
54 | p.setProperty("bootstrap.servers", argv.brokers)
55 | p.setProperty("acks", "all")
56 | p.setProperty("max.in.flight.requests.per.connection", argv.perConnection)
57 | p.setProperty("batch.size", argv.batchSize)
58 | p.setProperty("retries", argv.retries)
59 | p.setProperty("linger.ms", argv.lingerMs)
60 | p.setProperty("buffer.memory", argv.bufferMem)
61 | p.setProperty("compression.type", argv.topicCompression)
62 | p.setProperty("key.serializer", classOf[StringSerializer].getName)
63 | p.setProperty("value.serializer", classOf[ByteArraySerializer].getName)
64 | p
65 | }
66 | log.warn("kafka producer init done!")
67 | spark.sparkContext.broadcast(KafkaSink[String, Object](kafkaProducerConfig))
68 | }
69 |
70 | /**
71 | * read data from hive
72 | */
73 |
74 | val tableDF = spark.table(argv.hiveTableName).select(
75 | "jioyrq",
76 | "jioysj",
77 | "guiyls",
78 | "cpznxh",
79 | "jiaoym",
80 | "jiedbz",
81 | "jio1je",
82 | "kemucc",
83 | "kehuzh",
84 | "kehhao",
85 | "zhyodm",
86 | "hmjsjc",
87 | "huobdh")
88 |
89 |     /**
90 |      * Read each row and check selected fields for business-rule violations; violations should be recorded and sent to an error Kafka topic
91 |      *
92 |      * The input is a DataFrame: every field of every row is validated, kept as-is when it passes and replaced with a default value otherwise, producing a new row and finally a new DataFrame
93 |      */
94 |
95 |     log.info("========================================== Transforming the DataFrame ==========================================")
96 | val filterTableDF = tableDF.map(newRow =>
97 | (eventRow(newRow(0).toString,if ((!(newRow(1).toString).equals(""))) newRow(1).toString else "0",
98 | newRow(2).toString, if (!((newRow(3).toString).equals(""))) newRow(3).toString else "0", newRow(4).toString,
99 | newRow(5).toString, if (!((newRow(6).toString).equals(""))) newRow(6).toString else "0", newRow(7).toString, newRow(8).toString,
100 | newRow(9).toString, newRow(10).toString, newRow(11).toString, newRow(12).toString))
101 | )
102 |
103 |     /**
104 |      * Secondary sort
105 |      */
106 |     log.info("========================================== Starting secondary sort ==========================================")
107 |
108 | import org.apache.spark.sql._
109 | val sortFilterTableDF = filterTableDF.repartition(argv.partitionNum,new Column("kehhao")).sortWithinPartitions("huobdh")
110 |
111 |     /**
112 |      * Then write each row to its partition with foreachPartition; is a custom partitioner needed here?
113 |      */
114 |
115 |     log.info("========================================== Writing to Kafka ==========================================")
116 | /*
117 | sortFilterTableDF.rdd.mapPartitions(rows => {
118 | log.info("========================================== kafka 1 ==========================================")
119 | rows.map(row => {
120 | val kafkaPartition: Int = row.kehhao.toInt % argv.partitionNum
121 | log.info("kafkaPartition===============" + kafkaPartition)
122 | kafkaProducer.value.send(argv.topic, kafkaPartition ,row.kehhao.toString, row.toString)
123 | })
124 | }).collect()
125 | */
126 |
127 | sortFilterTableDF.foreachPartition(rows=>{
128 | while (rows.hasNext){
129 | val tmp = rows.next()
130 | var kafkaPartition = 0
131 | try {
132 | kafkaPartition = tmp.kehhao.trim.toInt % argv.partitionNum
133 | }catch{
134 | case ex: NumberFormatException =>{
135 | println(ex.getMessage)
136 |           log.warn("Bad record: " + tmp.toString())
137 | }
138 | case ex: Any => {
139 |           println("Unknown error!")
140 | }
141 | }
142 | //log.info("kafkaPartition===============" + kafkaPartition)
143 | kafkaProducer.value.send(argv.topic, kafkaPartition ,tmp.kehhao.toString+"_"+tmp.guiyls.toString+"_"+tmp.jioysj.toString,
144 | kryoSerializer.setSerializationObjectByKryo(tmp))
145 | }
146 | })
147 |
148 | kafkaProducer.value.producer.flush()
149 | kafkaProducer.value.producer.close()
150 |
151 | spark.close()
152 | }
153 |
154 | }
155 |
--------------------------------------------------------------------------------
/configFile/ignite-template.xml:
--------------------------------------------------------------------------------
(ignite-template.xml: XML markup not captured)
--------------------------------------------------------------------------------
/FunctionTestResult.md:
--------------------------------------------------------------------------------
1 | # Data Import
2 |
3 | ## Environment Setup
4 |
5 | Hive table:
6 |
7 | > show create table mm;
8 | OK
9 | CREATE TABLE `mm`(
10 | `jioyrq` string,
11 | `jioysj` string,
12 | `guiyls` string,
13 | `cpznxh` string,
14 | `jiaoym` string,
15 | `jiedbz` string,
16 | `jio1je` string,
17 | `kemucc` string,
18 | `kehuzh` string,
19 | `kehhao` string,
20 | `zhyodm` string,
21 | `hmjsjc` string,
22 | `huobdh` string)
23 | ROW FORMAT SERDE
24 | 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
25 | WITH SERDEPROPERTIES (
26 | 'field.delim'=',',
27 | 'serialization.encoding'='GBK')
28 | STORED AS INPUTFORMAT
29 | 'org.apache.hadoop.mapred.TextInputFormat'
30 | OUTPUTFORMAT
31 | 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
32 | LOCATION
33 | 'hdfs://namenode1:8020/user/hive/warehouse/mm'
34 | TBLPROPERTIES (
35 | 'COLUMN_STATS_ACCURATE'='false',
36 | 'last_modified_by'='root',
37 | 'last_modified_time'='1521495928',
38 | 'numFiles'='1',
39 | 'numRows'='-1',
40 | 'rawDataSize'='-1',
41 | 'totalSize'='1010',
42 | 'transient_lastDdlTime'='1521495928')
43 | Time taken: 0.198 seconds, Fetched: 34 row(s)
44 |
45 | topic:
46 |
47 | kafka-topics --describe --topic yc --zookeeper datanode1
48 |
49 | Topic:yc PartitionCount:3 ReplicationFactor:1 Configs:
50 | Topic: yc Partition: 0 Leader: 133 Replicas: 133 Isr: 133
51 | Topic: yc Partition: 1 Leader: 131 Replicas: 131 Isr: 131
52 | Topic: yc Partition: 2 Leader: 132 Replicas: 132 Isr: 132
53 |
54 | ## Command:
55 |
56 | Resources are not specified for this simple test; always specify them in a production environment
57 |
58 | spark2-submit \
59 | --class textProject.dataImportKafkaPerformance \
60 | --master yarn \
61 | --deploy-mode client \
62 | ./sparkstreamingkafkaperformance-1.0-SNAPSHOT.jar \
63 | -cachename yc \
64 | -igniteconfxml /opt/ignite/ignite-config-client.xml \
65 | -brokers datanode1:9092 \
66 | -partitionNum 3 \
67 | -groupid yc \
68 | -hiveTableName default.mm \
69 | -topic yc \
70 | -appName kafkainput
71 |
72 |
73 | ## Hive table data and schema:
74 |
75 | hive> select * from mm;
76 | OK
77 | 20180201 115655 200000010000001 5 7983 1 10000.00 S 62259910005001 11000001 其他代码 11/10/2018 01
78 | 20180201 115656 200000010000002 5 7983 1 10000.00 S 62259910005002 11000002 其他代码 11/11/2018 02
79 | 20180201 115657 200000010000003 5 7983 1 10000.00 S 62259910005003 11000003 其他代码 11/12/2018 03
80 | 20180201 115658 200000010000004 5 7983 1 10000.00 S 62259910005004 11000001 其他代码 11/13/2018 04
81 | 20180201 115659 200000010000005 5 7983 1 10000.00 S 62259910005005 11000002 其他代码 11/14/2018 05
82 | 20180201 115660 200000010000006 5 7983 1 10000.00 S 62259910005006 11000003 其他代码 11/15/2018 06
83 | 20180201 115661 200000010000007 5 7983 1 10000.00 S 62259910005007 11000001 其他代码 11/16/2018 07
84 | 20180201 115662 200000010000008 5 7983 1 10000.00 S 62259910005008 11000002 其他代码 11/17/2018 08
85 | 20180201 115663 200000010000009 5 7983 1 10000.00 S 62259910005009 11000003 其他代码 11/18/2018 09
86 | Time taken: 1.571 seconds, Fetched: 9 row(s)
87 |
88 | ## Results:
89 |
90 | kafka-console-consumer --topic yc --bootstrap-server datanode1:9092 --partition 0
91 |
92 | 18/03/20 08:25:48 INFO utils.AppInfoParser: Kafka version : 0.10.2-kafka-2.2.0
93 | 18/03/20 08:25:48 INFO utils.AppInfoParser: Kafka commitId : unknown
94 | utils.eventRow(20180201,115655,200000010000001,5,7983,1,10000.00 ,S,62259910005001,11000001,其他代码,11/10/2018,01)
95 | utils.eventRow(20180201,115658,200000010000004,5,7983,1,10000.00 ,S,62259910005004,11000001,其他代码,11/13/2018,04)
96 | utils.eventRow(20180201,115661,200000010000007,5,7983,1,10000.00 ,S,62259910005007,11000001,其他代码,11/16/2018,07)
97 |
98 | kafka-console-consumer --topic yc --bootstrap-server datanode1:9092 --partition 1
99 |
100 | 18/03/20 08:25:54 INFO utils.AppInfoParser: Kafka version : 0.10.2-kafka-2.2.0
101 | 18/03/20 08:25:54 INFO utils.AppInfoParser: Kafka commitId : unknown
102 | utils.eventRow(20180201,115656,200000010000002,5,7983,1,10000.00 ,S,62259910005002,11000002,其他代码,11/11/2018,02)
103 | utils.eventRow(20180201,115659,200000010000005,5,7983,1,10000.00 ,S,62259910005005,11000002,其他代码,11/14/2018,05)
104 | utils.eventRow(20180201,115662,200000010000008,5,7983,1,10000.00 ,S,62259910005008,11000002,其他代码,11/17/2018,08)
105 |
106 | kafka-console-consumer --topic yc --bootstrap-server datanode1:9092 --partition 2
107 |
108 | 18/03/20 08:25:59 INFO utils.AppInfoParser: Kafka version : 0.10.2-kafka-2.2.0
109 | 18/03/20 08:25:59 INFO utils.AppInfoParser: Kafka commitId : unknown
110 | utils.eventRow(20180201,115657,200000010000003,5,7983,1,10000.00 ,S,62259910005003,11000003,其他代码,11/12/2018,03)
111 | utils.eventRow(20180201,115660,200000010000006,5,7983,1,10000.00 ,S,62259910005006,11000003,其他代码,11/15/2018,06)
112 | utils.eventRow(20180201,115663,200000010000009,5,7983,1,10000.00 ,S,62259910005009,11000003,其他代码,11/18/2018,09)
113 |
114 |
115 | # Streaming
116 |
117 | ## Command
118 |
119 | spark2-submit \
120 | --class textProject.streamingKafkaToIgnitePerformance \
121 | --master yarn \
122 | --deploy-mode client \
123 | ./sparkstreamingkafkaperformance-1.0-SNAPSHOT.jar \
124 | -cachename yc \
125 | -igniteconfxml /opt/ignite/config/default-config.xml \
126 | -brokers datanode1:9092 \
127 | -partitionNum 3 \
128 | -groupid yc \
129 | -hiveTableName default.mm \
130 | -topic yc \
131 | -appName streamingToIgnite
132 |
133 |
134 | ## Ignite write results
135 |
136 | Note: only some of the columns are shown here, which appears to be an Ignite console display issue; querying the columns explicitly returns them all
137 |
138 | 0: jdbc:ignite:thin://datanode2/> select * from yc;
139 | +--------------------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+---------+
140 | | JIOYRQ | JIOYSJ | GUIYLS | CPZNXH | JIAOYM | JIEDBZ | |
141 | +--------------------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+---------+
142 | | 20180201 | 115655 | 200000010000001 | 1 | 7983 | 1 | 10000.0 |
143 | | 20180201 | 115662 | 200000010000008 | 1 | 7983 | 1 | 10000.0 |
144 | | 20180201 | 115661 | 200000010000007 | 1 | 7983 | 1 | 10000.0 |
145 | | 20180201 | 115660 | 200000010000006 | 1 | 7983 | 1 | 10000.0 |
146 | | 20180201 | 115658 | 200000010000004 | 1 | 7983 | 1 | 10000.0 |
147 | | 20180201 | 115657 | 200000010000003 | 1 | 7983 | 1 | 10000.0 |
148 | | 20180201 | 115656 | 200000010000002 | 1 | 7983 | 1 | 10000.0 |
149 | | 20180201 | 115663 | 200000010000009 | 1 | 7983 | 1 | 10000.0 |
150 | | 20180201 | 115659 | 200000010000005 | 1 | 7983 | 1 | 10000.0 |
151 | +--------------------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+---------+
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | streamingPerformance
8 | sparkstreamingkafkaperformance
9 | 1.0-SNAPSHOT
10 |
11 |
12 |
13 | com.beust
14 | jcommander
15 | 1.71
16 |
17 |
18 |
19 | org.apache.spark
20 | spark-streaming-kafka-0-10_2.11
21 | 2.2.1
22 |
23 |
24 |
25 | org.apache.spark
26 | spark-streaming_2.11
27 | 2.2.1
28 | provided
29 |
30 |
31 |
32 | org.apache.hadoop
33 | hadoop-hdfs
34 | 2.6.5
35 |
36 |
37 |
38 | dom4j
39 | dom4j
40 | 1.6.1
41 |
42 |
43 |
44 | junit
45 | junit
46 | 3.8.1
47 | test
48 |
49 |
50 |
51 | org.apache.kafka
52 | kafka-clients
53 | 0.10.0.1
54 |
55 |
56 |
57 | org.apache.ignite
58 | ignite-spark
59 | 2.4.0
60 |
61 |
62 | jdk.tools
63 | jdk.tools
64 |
65 |
66 |
67 |
68 |
69 | org.apache.ignite
70 | ignite-core
71 | 2.4.0
72 |
73 |
74 |
75 |
76 | org.apache.spark
77 | spark-core_2.11
78 | 2.2.1
79 |
80 |
81 |
82 | org.apache.spark
83 | spark-sql_2.11
84 | 2.2.1
85 |
86 |
87 |
88 | org.apache.spark
89 | spark-catalyst_2.11
90 | 2.2.1
91 |
92 |
93 |
94 | org.apache.spark
95 | spark-network-common_2.11
96 | 2.2.1
97 |
98 |
99 |
100 | org.apache.spark
101 | spark-network-shuffle_2.11
102 | 2.2.1
103 |
104 |
105 | org.apache.spark
106 | spark-tags_2.11
107 | 2.2.1
108 |
109 |
110 | org.apache.spark
111 | spark-unsafe_2.11
112 | 2.2.1
113 |
114 |
115 |
116 | org.apache.hadoop
117 | hadoop-common
118 | 2.6.0
119 | provided
120 |
121 |
122 | org.apache.hadoop
123 | hadoop-client
124 | 2.6.0
125 | provided
126 |
127 |
128 |
129 | org.apache.hive
130 | hive-contrib
131 | 1.1.0
132 |
133 |
134 |
135 |
136 | org.apache.curator
137 | curator-recipes
138 | 2.9.1
139 |
140 |
141 | org.apache.curator
142 | curator-client
143 | 2.9.1
144 |
145 |
146 | org.apache.curator
147 | curator-x-discovery
148 | 2.9.1
149 |
150 |
151 |
152 | com.esotericsoftware
153 | kryo
154 | 4.0.2
155 |
156 |
157 |
158 |
159 | src/main/scala
160 | src/test
161 |
162 |
163 |
164 | org.apache.maven.plugins
165 | maven-shade-plugin
166 | 3.0.0
167 |
168 |
169 | package
170 |
171 | shade
172 |
173 |
174 |
175 |
176 | *:*
177 |
178 | META-INF/*.SF
179 | META-INF/*.DSA
180 | META-INF/*.RSA
181 | META-INF/DUMMY.DSA
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 | org.codehaus.mojo
192 | exec-maven-plugin
193 | 1.3.2
194 |
195 |
196 |
197 | exec
198 |
199 |
200 |
201 |
202 | scala
203 | false
204 | false
205 | compile
206 | textProject.dataImportKafkaPerformance
207 |
208 |
209 |
210 |
229 |
230 |
231 | org.scala-tools
232 | maven-scala-plugin
233 | 2.15.2
234 |
235 |
236 |
237 | compile
238 | testCompile
239 |
240 |
241 |
242 |
243 |
244 |
245 | org.apache.maven.plugins
246 | maven-compiler-plugin
247 | 3.1
248 |
249 | 1.8
250 | 1.8
251 |
252 |
253 |
254 | org.apache.maven.plugins
255 | maven-assembly-plugin
256 |
257 | src/assembly/bin.xml
258 | ${project.name}-${project.version}
259 |
260 |
261 |
262 | package
263 |
264 | single
265 |
266 |
267 |
268 |
269 |
270 |
297 |
298 |
299 |
300 |
301 |
--------------------------------------------------------------------------------