├── FunctionTestResult.md ├── KafkaTuning.md ├── PerformanceTestResult.md ├── README.md ├── bin ├── dataImportKafkaPerformance.sh ├── pef.sh ├── sparkstreamingkafkaperformance.sh └── structuredStreamingkafkaperformance.sh ├── build.sbt ├── configFile └── ignite-template.xml ├── pom.xml ├── smokeData └── imputData.md └── src ├── assembly └── bin.xml └── main ├── java └── mycallback.java └── scala ├── common ├── Args.scala ├── KafkaSink.scala ├── eventRow.scala ├── igniteWriter.scala ├── kryoSerializer.scala └── mycallback.scala ├── objectProject ├── dataImportKafkaPerformance.scala ├── streamingKafkaToIgnitePerformance.scala └── structuredStreamingKafkaToIgnitePerformance.scala └── textProject ├── dataImportKafkaPerformance.scala ├── streamingKafkaToIgnitePerformance.scala └── structuredStreamingKafkaToIgnitePerformance.scala /FunctionTestResult.md: -------------------------------------------------------------------------------- 1 | # 数据导入 2 | 3 | ## 环境准备 4 | 5 | hive表: 6 | 7 | > show create table mm; 8 | OK 9 | CREATE TABLE `mm`( 10 | `jioyrq` string, 11 | `jioysj` string, 12 | `guiyls` string, 13 | `cpznxh` string, 14 | `jiaoym` string, 15 | `jiedbz` string, 16 | `jio1je` string, 17 | `kemucc` string, 18 | `kehuzh` string, 19 | `kehhao` string, 20 | `zhyodm` string, 21 | `hmjsjc` string, 22 | `huobdh` string) 23 | ROW FORMAT SERDE 24 | 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 25 | WITH SERDEPROPERTIES ( 26 | 'field.delim'=',', 27 | 'serialization.encoding'='GBK') 28 | STORED AS INPUTFORMAT 29 | 'org.apache.hadoop.mapred.TextInputFormat' 30 | OUTPUTFORMAT 31 | 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' 32 | LOCATION 33 | 'hdfs://namenode1:8020/user/hive/warehouse/mm' 34 | TBLPROPERTIES ( 35 | 'COLUMN_STATS_ACCURATE'='false', 36 | 'last_modified_by'='root', 37 | 'last_modified_time'='1521495928', 38 | 'numFiles'='1', 39 | 'numRows'='-1', 40 | 'rawDataSize'='-1', 41 | 'totalSize'='1010', 42 | 'transient_lastDdlTime'='1521495928') 43 | Time taken: 0.198 seconds, Fetched: 34 row(s) 44 | 45 | topic: 46 | 47 | kafka-topics --describe --topic yc --zookeeper datanode1 48 | 49 | Topic:yc PartitionCount:3 ReplicationFactor:1 Configs: 50 | Topic: yc Partition: 0 Leader: 133 Replicas: 133 Isr: 133 51 | Topic: yc Partition: 1 Leader: 131 Replicas: 131 Isr: 131 52 | Topic: yc Partition: 2 Leader: 132 Replicas: 132 Isr: 132 53 | 54 | ## 执行命令: 55 | 56 | 这里简单的测试就不指定资源了,生产环境一定要指定 57 | 58 | spark2-submit \ 59 | --class textProject.dataImportKafkaPerformance \ 60 | --master yarn \ 61 | --deploy-mode client \ 62 | ./sparkstreamingkafkaperformance-1.0-SNAPSHOT.jar \ 63 | -cachename yc \ 64 | -igniteconfxml /opt/ignite/ignite-config-client.xml \ 65 | -brokers datanode1:9092 \ 66 | -partitionNum 3 \ 67 | -groupid yc \ 68 | -hiveTableName default.mm \ 69 | -topic yc \ 70 | -appName kafkainput 71 | 72 | 73 | ## hive表中数据及表结构: 74 | 75 | hive> select * from mm; 76 | OK 77 | 20180201 115655 200000010000001 5 7983 1 10000.00 S 62259910005001 11000001 其他代码 11/10/2018 01 78 | 20180201 115656 200000010000002 5 7983 1 10000.00 S 62259910005002 11000002 其他代码 11/11/2018 02 79 | 20180201 115657 200000010000003 5 7983 1 10000.00 S 62259910005003 11000003 其他代码 11/12/2018 03 80 | 20180201 115658 200000010000004 5 7983 1 10000.00 S 62259910005004 11000001 其他代码 11/13/2018 04 81 | 20180201 115659 200000010000005 5 7983 1 10000.00 S 62259910005005 11000002 其他代码 11/14/2018 05 82 | 20180201 115660 200000010000006 5 7983 1 10000.00 S 62259910005006 11000003 其他代码 11/15/2018 06 83 | 20180201 115661 
200000010000007 5 7983 1 10000.00 S 62259910005007 11000001 其他代码 11/16/2018 07 84 | 20180201 115662 200000010000008 5 7983 1 10000.00 S 62259910005008 11000002 其他代码 11/17/2018 08 85 | 20180201 115663 200000010000009 5 7983 1 10000.00 S 62259910005009 11000003 其他代码 11/18/2018 09 86 | Time taken: 1.571 seconds, Fetched: 9 row(s) 87 | 88 | ## 执行结果: 89 | 90 | kafka-console-consumer --topic yc --bootstrap-server datanode1:9092 --partition 0 91 | 92 | 18/03/20 08:25:48 INFO utils.AppInfoParser: Kafka version : 0.10.2-kafka-2.2.0 93 | 18/03/20 08:25:48 INFO utils.AppInfoParser: Kafka commitId : unknown 94 | utils.eventRow(20180201,115655,200000010000001,5,7983,1,10000.00 ,S,62259910005001,11000001,其他代码,11/10/2018,01) 95 | utils.eventRow(20180201,115658,200000010000004,5,7983,1,10000.00 ,S,62259910005004,11000001,其他代码,11/13/2018,04) 96 | utils.eventRow(20180201,115661,200000010000007,5,7983,1,10000.00 ,S,62259910005007,11000001,其他代码,11/16/2018,07) 97 | 98 | kafka-console-consumer --topic yc --bootstrap-server datanode1:9092 --partition 1 99 | 100 | 18/03/20 08:25:54 INFO utils.AppInfoParser: Kafka version : 0.10.2-kafka-2.2.0 101 | 18/03/20 08:25:54 INFO utils.AppInfoParser: Kafka commitId : unknown 102 | utils.eventRow(20180201,115656,200000010000002,5,7983,1,10000.00 ,S,62259910005002,11000002,其他代码,11/11/2018,02) 103 | utils.eventRow(20180201,115659,200000010000005,5,7983,1,10000.00 ,S,62259910005005,11000002,其他代码,11/14/2018,05) 104 | utils.eventRow(20180201,115662,200000010000008,5,7983,1,10000.00 ,S,62259910005008,11000002,其他代码,11/17/2018,08) 105 | 106 | kafka-console-consumer --topic yc --bootstrap-server datanode1:9092 --partition 2 107 | 108 | 18/03/20 08:25:59 INFO utils.AppInfoParser: Kafka version : 0.10.2-kafka-2.2.0 109 | 18/03/20 08:25:59 INFO utils.AppInfoParser: Kafka commitId : unknown 110 | utils.eventRow(20180201,115657,200000010000003,5,7983,1,10000.00 ,S,62259910005003,11000003,其他代码,11/12/2018,03) 111 | utils.eventRow(20180201,115660,200000010000006,5,7983,1,10000.00 ,S,62259910005006,11000003,其他代码,11/15/2018,06) 112 | utils.eventRow(20180201,115663,200000010000009,5,7983,1,10000.00 ,S,62259910005009,11000003,其他代码,11/18/2018,09) 113 | 114 | 115 | # 流式计算 116 | 117 | ## 执行命令 118 | 119 | spark2-submit \ 120 | --class textProject.streamingKafkaToIgnitePerformance \ 121 | --master yarn \ 122 | --deploy-mode client \ 123 | ./sparkstreamingkafkaperformance-1.0-SNAPSHOT.jar \ 124 | -cachename yc \ 125 | -igniteconfxml /opt/ignite/config/default-config.xml \ 126 | -brokers datanode1:9092 \ 127 | -partitionNum 3 \ 128 | -groupid yc \ 129 | -hiveTableName default.mm \ 130 | -topic yc \ 131 | -appName streamingToIgnite 132 | 133 | 134 | ## ignite写入结果 135 | 136 | 注意:这里只显示出了部分列,应该是ignite的问题,可以直接指定列区查询,就能显示所有 137 | 138 | 0: jdbc:ignite:thin://datanode2/> select * from yc; 139 | +--------------------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+---------+ 140 | | JIOYRQ | JIOYSJ | GUIYLS | CPZNXH | JIAOYM | JIEDBZ | | 141 | +--------------------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+---------+ 142 | | 20180201 | 115655 | 200000010000001 | 1 | 7983 | 1 | 10000.0 | 143 | | 20180201 | 115662 | 200000010000008 | 1 | 7983 | 1 | 10000.0 | 144 | | 20180201 | 115661 | 200000010000007 | 1 | 7983 | 1 | 10000.0 | 145 | | 20180201 | 115660 
| 200000010000006 | 1 | 7983 | 1 | 10000.0 | 146 | | 20180201 | 115658 | 200000010000004 | 1 | 7983 | 1 | 10000.0 | 147 | | 20180201 | 115657 | 200000010000003 | 1 | 7983 | 1 | 10000.0 | 148 | | 20180201 | 115656 | 200000010000002 | 1 | 7983 | 1 | 10000.0 | 149 | | 20180201 | 115663 | 200000010000009 | 1 | 7983 | 1 | 10000.0 | 150 | | 20180201 | 115659 | 200000010000005 | 1 | 7983 | 1 | 10000.0 | 151 | +--------------------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+--------------------------------+---------+ 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | -------------------------------------------------------------------------------- /KafkaTuning.md: -------------------------------------------------------------------------------- 1 | # producer tuning 2 | 3 | Most important configurations which needs to be taken care at Producer side are: 4 | 5 | **1. Compression** 6 | 7 | **2. Batch size** 8 | 9 | **3. Sync or Async** 10 | 11 | 主要是如下参数: 12 | 13 | batch.size: 基于大小的batching策略 14 | linger.ms: 基于时间的batching策略 15 | compression.type:压缩的速度上lz4=snappy 2 | 3 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | streamingPerformance 8 | sparkstreamingkafkaperformance 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 13 | com.beust 14 | jcommander 15 | 1.71 16 | 17 | 18 | 19 | org.apache.spark 20 | spark-streaming-kafka-0-10_2.11 21 | 2.2.1 22 | 23 | 24 | 25 | org.apache.spark 26 | spark-streaming_2.11 27 | 2.2.1 28 | provided 29 | 30 | 31 | 32 | org.apache.hadoop 33 | hadoop-hdfs 34 | 2.6.5 35 | 36 | 37 | 38 | dom4j 39 | dom4j 40 | 1.6.1 41 | 42 | 43 | 44 | junit 45 | junit 46 | 3.8.1 47 | test 48 | 49 | 50 | 51 | org.apache.kafka 52 | kafka-clients 53 | 0.10.0.1 54 | 55 | 56 | 57 | org.apache.ignite 58 | ignite-spark 59 | 2.4.0 60 | 61 | 62 | jdk.tools 63 | jdk.tools 64 | 65 | 66 | 67 | 68 | 69 | org.apache.ignite 70 | ignite-core 71 | 2.4.0 72 | 73 | 74 | 75 | 76 | org.apache.spark 77 | spark-core_2.11 78 | 2.2.1 79 | 80 | 81 | 82 | org.apache.spark 83 | spark-sql_2.11 84 | 2.2.1 85 | 86 | 87 | 88 | org.apache.spark 89 | spark-catalyst_2.11 90 | 2.2.1 91 | 92 | 93 | 94 | org.apache.spark 95 | spark-network-common_2.11 96 | 2.2.1 97 | 98 | 99 | 100 | org.apache.spark 101 | spark-network-shuffle_2.11 102 | 2.2.1 103 | 104 | 105 | org.apache.spark 106 | spark-tags_2.11 107 | 2.2.1 108 | 109 | 110 | org.apache.spark 111 | spark-unsafe_2.11 112 | 2.2.1 113 | 114 | 115 | 116 | org.apache.hadoop 117 | hadoop-common 118 | 2.6.0 119 | provided 120 | 121 | 122 | org.apache.hadoop 123 | hadoop-client 124 | 2.6.0 125 | provided 126 | 127 | 128 | 129 | org.apache.hive 130 | 
hive-contrib 131 | 1.1.0 132 | 133 | 134 | 135 | 136 | org.apache.curator 137 | curator-recipes 138 | 2.9.1 139 | 140 | 141 | org.apache.curator 142 | curator-client 143 | 2.9.1 144 | 145 | 146 | org.apache.curator 147 | curator-x-discovery 148 | 2.9.1 149 | 150 | 151 | 152 | com.esotericsoftware 153 | kryo 154 | 4.0.2 155 | 156 | 157 | 158 | 159 | src/main/scala 160 | src/test 161 | 162 | 163 | 164 | org.apache.maven.plugins 165 | maven-shade-plugin 166 | 3.0.0 167 | 168 | 169 | package 170 | 171 | shade 172 | 173 | 174 | 175 | 176 | *:* 177 | 178 | META-INF/*.SF 179 | META-INF/*.DSA 180 | META-INF/*.RSA 181 | META-INF/DUMMY.DSA 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | org.codehaus.mojo 192 | exec-maven-plugin 193 | 1.3.2 194 | 195 | 196 | 197 | exec 198 | 199 | 200 | 201 | 202 | scala 203 | false 204 | false 205 | compile 206 | textProject.dataImportKafkaPerformance 207 | 208 | 209 | 210 | 229 | 230 | 231 | org.scala-tools 232 | maven-scala-plugin 233 | 2.15.2 234 | 235 | 236 | 237 | compile 238 | testCompile 239 | 240 | 241 | 242 | 243 | 244 | 245 | org.apache.maven.plugins 246 | maven-compiler-plugin 247 | 3.1 248 | 249 | 1.8 250 | 1.8 251 | 252 | 253 | 254 | org.apache.maven.plugins 255 | maven-assembly-plugin 256 | 257 | src/assembly/bin.xml 258 | ${project.name}-${project.version} 259 | 260 | 261 | 262 | package 263 | 264 | single 265 | 266 | 267 | 268 | 269 | 270 | 297 | 298 | 299 | 300 | 301 | -------------------------------------------------------------------------------- /smokeData/imputData.md: -------------------------------------------------------------------------------- 1 | 20180201,115655,200000010000001,1,7983,1,10000.00 ,S,62259910005001,11000001,其他代码,11/10/2018,01 2 | 20180201,115656,200000010000002,1,7983,1,10000.00 ,S,62259910005002,11000002,其他代码,11/11/2018,02 3 | 20180201,115657,200000010000003,1,7983,1,10000.00 ,S,62259910005003,11000003,其他代码,11/12/2018,03 4 | 20180201,115658,200000010000004,1,7983,1,10000.00 ,S,62259910005004,11000001,其他代码,11/13/2018,04 5 | 20180201,115659,200000010000005,1,7983,1,10000.00 ,S,62259910005005,11000002,其他代码,11/14/2018,05 6 | 20180201,115660,200000010000006,1,7983,1,10000.00 ,S,62259910005006,11000003,其他代码,11/15/2018,06 7 | 20180201,115661,200000010000007,1,7983,1,10000.00 ,S,62259910005007,11000001,其他代码,11/16/2018,07 8 | 20180201,115662,200000010000008,1,7983,1,10000.00 ,S,62259910005008,11000002,其他代码,11/17/2018,08 9 | 20180201,115663,200000010000009,1,7983,1,10000.00 ,S,62259910005009,11000003,其他代码,11/18/2018,09 10 | 11 | 12 | 注意:有中文编码是gbk,可以再建hive表时指定编码,这样spark读取出来就不会乱码 -------------------------------------------------------------------------------- /src/assembly/bin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | tar.gz 4 | 5 | 12 | 13 | 14 | smokeData 15 | smokeData 16 | 17 | 18 | configFile 19 | configFile 20 | 21 | 22 | bin 23 | bin 24 | 0755 25 | 0755 26 | 27 | *.sh 28 | 29 | unix 30 | 31 | 32 | 33 | target 34 | lib 35 | 0644 36 | 37 | ${project.name}-${project.version}.jar 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /src/main/java/mycallback.java: -------------------------------------------------------------------------------- 1 | import org.apache.kafka.clients.producer.Callback; 2 | import org.apache.kafka.clients.producer.RecordMetadata; 3 | 4 | public class mycallback implements Callback { 5 | @Override 6 | public void onCompletion(RecordMetadata metadata, Exception exception) { 
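        // The callback body is left empty in this repository. An illustrative sketch
        // (an assumption, not part of the original code) of what a production callback
        // would typically do is surface failed sends, e.g.:
        //
        //   if (exception != null) {
        //       System.err.println("Kafka send failed: " + exception.getMessage());
        //   }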
7 | 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/common/Args.scala: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import com.beust.jcommander.Parameter 4 | 5 | class Args extends Serializable { 6 | 7 | @Parameter(names = Array("-appName"), required = true) var appName: String = null 8 | 9 | @Parameter(names = Array("-igniteconfxml"), required = true) var igniteconfxml: String = null 10 | 11 | @Parameter(names = Array("-cachename"), required = true) var cachename: String = null 12 | 13 | @Parameter(names = Array("-partitionNum"), required = true) var partitionNum: Integer = null 14 | 15 | @Parameter(names = Array("-brokers"), required = true) var brokers: String = null 16 | 17 | @Parameter(names = Array("-groupid"), required = true) var groupid: String = null 18 | 19 | @Parameter(names = Array("-topic"), required = true) var topic: String = null 20 | 21 | @Parameter(names = Array("-hiveTableName"), required = true) var hiveTableName: String = null 22 | 23 | @Parameter(names = Array("-topicCompression"), required = false) var topicCompression: String = "snappy" 24 | 25 | @Parameter(names = Array("-bufferMem"), required = false) var bufferMem: String = "33554432" 26 | 27 | @Parameter(names = Array("-lingerMs"), required = false) var lingerMs: String = "0" 28 | 29 | @Parameter(names = Array("-retries"), required = false) var retries: String = "0" 30 | 31 | @Parameter(names = Array("-durationTime"), required = false) var durationTime: Int = 500 32 | 33 | @Parameter(names = Array("-perConnection"), required = false) var perConnection: String = "1" 34 | 35 | @Parameter(names = Array("-batchSize"), required = false) var batchSize: String = "65536" 36 | 37 | @Parameter(names = Array("-allowOverwrite"), required = false) var allowOverwrite: Boolean = true 38 | 39 | @Parameter(names = Array("-primaryKey"), required = false) var primaryKey: String = "guiyls,kehhao,jioysj" 40 | 41 | @Parameter(names = Array("-writeMode"), required = false) var writeMode: String = "Append" 42 | 43 | @Parameter(names = Array("-tableParameters"), required = false) var tableParameters: String = "BACKUPS=1, ATOMICITY=TRANSACTIONAL, CACHE_NAME=yc, DATA_REGION=Default_Region" 44 | 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/common/KafkaSink.scala: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | 4 | import java.util.concurrent.Future 5 | 6 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata} 7 | import org.apache.kafka.common.errors.InterruptException 8 | 9 | class KafkaSink[K, V](createProducer: () => KafkaProducer[K, V]) extends Serializable { 10 | /* This is the key idea that allows us to work around running into 11 | NotSerializableExceptions. 
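     KafkaProducer is not serializable, so instead of shipping a producer instance to the
     executors we ship only the createProducer() function; the lazy val below makes each
     executor JVM build its own KafkaProducer the first time send() is called.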
*/ 12 | lazy val producer = createProducer() 13 | 14 | def send(topic: String, key: K, value: V): Future[RecordMetadata] = 15 | producer.send(new ProducerRecord[K, V](topic, key, value)) 16 | 17 | def send(topic: String, value: V): Future[RecordMetadata] = 18 | producer.send(new ProducerRecord[K, V](topic, value)) 19 | 20 | def send(topic: String, partitionNum: Integer, key: K, value: V) = 21 | producer.send(new ProducerRecord[K,V](topic, partitionNum, key, value)) 22 | 23 | def send(topic: String, partitionNum: Integer, key: K, value: V, callback: mycallback) = 24 | try { 25 | producer.send(new ProducerRecord[K, V](topic, partitionNum, key, value), new mycallback).get() 26 | } catch { 27 | case ex: InterruptException => print("inter") 28 | } 29 | } 30 | 31 | object KafkaSink { 32 | 33 | import scala.collection.JavaConversions._ 34 | 35 | def apply[K, V](config: Map[String, Object]): KafkaSink[String, Object] = { 36 | val createProducerFunc = () => { 37 | val producer = new KafkaProducer[String, Object](config) 38 | sys.addShutdownHook { 39 | // Ensure that, on executor JVM shutdown, the Kafka producer sends 40 | // any buffered messages to Kafka before shutting down. 41 | producer.close() 42 | } 43 | producer 44 | } 45 | new KafkaSink(createProducerFunc) 46 | } 47 | 48 | def apply[K, V](config: java.util.Properties): KafkaSink[String, Object] = apply(config.toMap) 49 | } 50 | 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /src/main/scala/common/eventRow.scala: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import java.io.ByteArrayOutputStream 4 | 5 | import com.esotericsoftware.kryo.Kryo 6 | import com.esotericsoftware.kryo.io.{Input, Output} 7 | 8 | case class eventRow( 9 | jioyrq: String, 10 | jioysj: String, 11 | guiyls: String, 12 | cpznxh: String, 13 | jiaoym: String, 14 | jiedbz: String, 15 | jio1je: String, 16 | kemucc: String, 17 | kehuzh: String, 18 | kehhao: String, 19 | zhyodm: String, 20 | hmjsjc: String, 21 | huobdh: String 22 | ) { 23 | def setSerializationObjectByKryo(ob: Object): Array[Byte] = { 24 | 25 | var by = new ByteArrayOutputStream() 26 | var output = new Output(by) 27 | try { 28 | val kryo = new Kryo() 29 | kryo.writeObject(output, ob) 30 | output.close() 31 | }catch { 32 | case ex:Any => { 33 | ex.printStackTrace() 34 | } 35 | } 36 | by.toByteArray 37 | 38 | } 39 | 40 | def getSerializationObjectByKryo(bytes: Array[Byte]) = { 41 | 42 | var input = new Input(bytes) 43 | var event: eventRow = null 44 | 45 | try { 46 | val kryo = new Kryo() 47 | event = kryo.readObject(input, classOf[eventRow]) 48 | input.close() 49 | }catch { 50 | case ex:Any => { 51 | ex.printStackTrace() 52 | } 53 | } 54 | 55 | event 56 | 57 | } 58 | } 59 | 60 | 61 | -------------------------------------------------------------------------------- /src/main/scala/common/igniteWriter.scala: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import java.sql.{Connection, DriverManager} 4 | 5 | import com.sun.mail.iap.ConnectionException 6 | import org.apache.spark.sql.{ForeachWriter, Row} 7 | 8 | class igniteWriter(igniteJdbc: String) extends ForeachWriter[Row] { 9 | 10 | 11 | //"jdbc:ignite:cfg://file:///etc/config/ignite-jdbc.xml" 12 | //"INSERT INTO Person(_key, name, age) VALUES(CAST(? 
as BIGINT), ?, ?)" 13 | 14 | var connection: Connection = null 15 | Class.forName("org.apache.ignite.IgniteJdbcDriver") 16 | 17 | override def open(partitionId: Long, version: Long): Boolean = { 18 | 19 | try { 20 | connection = DriverManager.getConnection(igniteJdbc) 21 | } catch { 22 | case ex: ConnectionException => { 23 | ex.printStackTrace() 24 | println("连接ignite错误:"+igniteJdbc) 25 | } 26 | } 27 | 28 | true 29 | } 30 | 31 | override def process(value: Row): Unit = { 32 | val stmt = connection.prepareStatement("MERGE INTO yc(jioyrq,jioysj,guiyls,cpznxh,jiaoym,jiedbz,jio1je,kemucc,kehuzh,kehhao," + 33 | "zhyodm,hmjsjc,huobdh) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)") 34 | val tmp:eventRow = eventRow(value.mkString(",")(0).toString.replace("(",""),value.mkString(",")(0).toString, 35 | value.mkString(",")(0).toString,value.mkString(",")(0).toString,value.mkString(",")(0).toString,value.mkString(",")(0).toString, 36 | value.mkString(",")(0).toString,value.mkString(",")(0).toString,value.mkString(",")(0).toString,value.mkString(",")(0).toString, 37 | value.mkString(",")(0).toString,value.mkString(",")(0).toString,value.mkString(",")(0).toString) 38 | stmt.setString(1,tmp.jioyrq) 39 | stmt.setString(2,tmp.jioysj) 40 | stmt.setString(3,tmp.guiyls) 41 | stmt.setString(4,tmp.cpznxh) 42 | stmt.setString(5,tmp.jiaoym) 43 | stmt.setString(6,tmp.jiedbz) 44 | stmt.setString(7,tmp.jio1je) 45 | stmt.setString(8,tmp.kemucc) 46 | stmt.setString(9,tmp.kehuzh) 47 | stmt.setString(10,tmp.kehhao) 48 | stmt.setString(11,tmp.zhyodm) 49 | stmt.setString(12,tmp.hmjsjc) 50 | stmt.setString(13,tmp.huobdh) 51 | stmt.execute() 52 | } 53 | 54 | override def close(errorOrNull: Throwable): Unit = connection.close() 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/common/kryoSerializer.scala: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import java.io.ByteArrayOutputStream 4 | 5 | import com.esotericsoftware.kryo.Kryo 6 | import com.esotericsoftware.kryo.io.{Input, Output} 7 | 8 | object kryoSerializer { 9 | 10 | def setSerializationObjectByKryo(ob: Object):Array[Byte] = { 11 | 12 | var by = new ByteArrayOutputStream() 13 | var output = new Output(by) 14 | try { 15 | val kryo = new Kryo() 16 | kryo.writeObject(output, ob) 17 | output.close() 18 | }catch { 19 | case ex:Any => { 20 | ex.printStackTrace() 21 | } 22 | } 23 | by.toByteArray 24 | 25 | } 26 | 27 | def getSerializationObjectByKryo(bytes: Array[Byte]) = { 28 | 29 | var input = new Input(bytes) 30 | var event:eventRow = null 31 | 32 | try { 33 | val kryo = new Kryo() 34 | event = kryo.readObject(input,classOf[eventRow]) 35 | input.close() 36 | }catch { 37 | case ex:Any => { 38 | ex.printStackTrace() 39 | } 40 | } 41 | 42 | event 43 | 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/common/mycallback.scala: -------------------------------------------------------------------------------- 1 | package common 2 | 3 | import org.apache.kafka.clients.producer.{Callback, RecordMetadata} 4 | 5 | class mycallback extends Callback{ 6 | override def onCompletion(metadata: RecordMetadata, exception: Exception): Unit = ??? 
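  // `???` throws scala.NotImplementedError whenever the Kafka producer invokes this callback.
  // An illustrative implementation (an assumption, not taken from the original source):
  //
  //   override def onCompletion(metadata: RecordMetadata, exception: Exception): Unit =
  //     if (exception != null) exception.printStackTrace()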
7 | } 8 | -------------------------------------------------------------------------------- /src/main/scala/objectProject/dataImportKafkaPerformance.scala: -------------------------------------------------------------------------------- 1 | package objectProject 2 | 3 | import java.util.Properties 4 | 5 | import com.beust.jcommander.JCommander 6 | import org.apache.kafka.common.serialization.{ByteArraySerializer, StringSerializer} 7 | import org.apache.log4j.Logger 8 | import org.apache.spark.broadcast.Broadcast 9 | import org.apache.spark.sql.SparkSession 10 | import common.{Args, KafkaSink, eventRow, kryoSerializer} 11 | 12 | class dataImportKafkaPerformance() { 13 | 14 | } 15 | 16 | object dataImportKafkaPerformance { 17 | 18 | private val log = Logger.getLogger(classOf[dataImportKafkaPerformance]) 19 | val sTime: Long = System.currentTimeMillis 20 | 21 | def main(args: Array[String]): Unit = { 22 | 23 | dataImportKafka(args) 24 | 25 | } 26 | 27 | def dataImportKafka(args: Array[String]): Unit = { 28 | //获取传入参数 29 | log.info("========================================== 初始化jcommander ==========================================") 30 | val argv = new Args() 31 | JCommander.newBuilder().addObject(argv).build().parse(args: _*) 32 | 33 | //创建sparksession 34 | val spark = SparkSession 35 | .builder() 36 | .appName(argv.appName) 37 | .enableHiveSupport() 38 | .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 39 | .getOrCreate() 40 | 41 | spark.sparkContext.getConf.registerKryoClasses(Array(classOf[Args],classOf[eventRow])) 42 | 43 | import spark.implicits._ 44 | 45 | /** 46 | * only used for test 47 | */ 48 | //log.warn("打印出所有的配置项,供优化参考: \n" + spark.conf.getAll) 49 | 50 | log.info("========================================== 初始化kafka producer ==========================================") 51 | val kafkaProducer: Broadcast[KafkaSink[String, Object]] = { 52 | val kafkaProducerConfig = { 53 | val p = new Properties() 54 | p.setProperty("bootstrap.servers", argv.brokers) 55 | p.setProperty("acks", "all") 56 | p.setProperty("max.in.flight.requests.per.connection", argv.perConnection) 57 | p.setProperty("batch.size", argv.batchSize) 58 | p.setProperty("retries", argv.retries) 59 | p.setProperty("linger.ms", argv.lingerMs) 60 | p.setProperty("buffer.memory", argv.bufferMem) 61 | p.setProperty("compression.type", argv.topicCompression) 62 | p.setProperty("key.serializer", classOf[StringSerializer].getName) 63 | p.setProperty("value.serializer", classOf[ByteArraySerializer].getName) 64 | p 65 | } 66 | log.warn("kafka producer init done!") 67 | spark.sparkContext.broadcast(KafkaSink[String, Object](kafkaProducerConfig)) 68 | } 69 | 70 | /** 71 | * read data from hive 72 | */ 73 | 74 | val tableDF = spark.table(argv.hiveTableName).select( 75 | "jioyrq", 76 | "jioysj", 77 | "guiyls", 78 | "cpznxh", 79 | "jiaoym", 80 | "jiedbz", 81 | "jio1je", 82 | "kemucc", 83 | "kehuzh", 84 | "kehhao", 85 | "zhyodm", 86 | "hmjsjc", 87 | "huobdh") 88 | 89 | /** 90 | * 一行读取出来,然后判断一行中部分字段是否有业务逻辑问题,如有则记录,发送到error kafka topic中 91 | * 92 | * 输入的是df,然后需要对每一行的每一个字段进行逻辑判断,满足要求则直接取值,不满足要求则改变其值,然后返回一行新的row,最后返回一个新的df 93 | */ 94 | 95 | log.info("========================================== 开始转换df ==========================================") 96 | val filterTableDF = tableDF.map(newRow => 97 | (eventRow(newRow(0).toString,if ((!(newRow(1).toString).equals(""))) newRow(1).toString else "0", 98 | newRow(2).toString, if (!((newRow(3).toString).equals(""))) newRow(3).toString else "0", newRow(4).toString, 99 | 
newRow(5).toString, if (!((newRow(6).toString).equals(""))) newRow(6).toString else "0", newRow(7).toString, newRow(8).toString, 100 | newRow(9).toString, newRow(10).toString, newRow(11).toString, newRow(12).toString)) 101 | ) 102 | 103 | /** 104 | * 进行二次排序 105 | */ 106 | log.info("========================================== 开始二次排序 ==========================================") 107 | 108 | import org.apache.spark.sql._ 109 | val sortFilterTableDF = filterTableDF.repartition(argv.partitionNum,new Column("kehhao")).sortWithinPartitions("huobdh") 110 | 111 | /** 112 | * 然后调用foreatchPartition写入对应的分区,这里是否需要自定义partitioner? 113 | */ 114 | 115 | log.info("========================================== 开始写入kafka ==========================================") 116 | /* 117 | sortFilterTableDF.rdd.mapPartitions(rows => { 118 | log.info("========================================== kafka 1 ==========================================") 119 | rows.map(row => { 120 | val kafkaPartition: Int = row.kehhao.toInt % argv.partitionNum 121 | log.info("kafkaPartition===============" + kafkaPartition) 122 | kafkaProducer.value.send(argv.topic, kafkaPartition ,row.kehhao.toString, row.toString) 123 | }) 124 | }).collect() 125 | */ 126 | 127 | sortFilterTableDF.foreachPartition(rows=>{ 128 | while (rows.hasNext){ 129 | val tmp = rows.next() 130 | var kafkaPartition = 0 131 | try { 132 | kafkaPartition = tmp.kehhao.trim.toInt % argv.partitionNum 133 | }catch{ 134 | case ex: NumberFormatException =>{ 135 | println(ex.getMessage) 136 | log.warn("异常数据:"+tmp.toString()) 137 | } 138 | case ex: Any => { 139 | println("Unkown error!!") 140 | } 141 | } 142 | //log.info("kafkaPartition===============" + kafkaPartition) 143 | kafkaProducer.value.send(argv.topic, kafkaPartition ,tmp.kehhao.toString+"_"+tmp.guiyls.toString+"_"+tmp.jioysj.toString, 144 | kryoSerializer.setSerializationObjectByKryo(tmp)) 145 | } 146 | }) 147 | 148 | kafkaProducer.value.producer.flush() 149 | kafkaProducer.value.producer.close() 150 | 151 | spark.close() 152 | } 153 | 154 | } 155 | -------------------------------------------------------------------------------- /src/main/scala/objectProject/streamingKafkaToIgnitePerformance.scala: -------------------------------------------------------------------------------- 1 | package objectProject 2 | 3 | import com.beust.jcommander.JCommander 4 | import org.apache.ignite.spark.IgniteDataFrameSettings._ 5 | import org.apache.kafka.common.serialization.{ByteArrayDeserializer, StringDeserializer} 6 | import org.apache.log4j.Logger 7 | import org.apache.spark.sql.SparkSession 8 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe 9 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent 10 | import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils, OffsetRange} 11 | import org.apache.spark.streaming.{Duration, StreamingContext} 12 | import common.{Args, eventRow, kryoSerializer} 13 | 14 | class streamingKafkaToIgnitePerformance { 15 | 16 | } 17 | 18 | object streamingKafkaToIgnitePerformance { 19 | 20 | private val log = Logger.getLogger(classOf[streamingKafkaToIgnitePerformance]) 21 | 22 | def main(args: Array[String]): Unit = { 23 | 24 | /** 25 | * 获取输入参数与定义全局变量 26 | */ 27 | 28 | log.info("获取输入变量") 29 | val argv = new Args() 30 | JCommander.newBuilder().addObject(argv).build().parse(args: _*) 31 | 32 | /** 33 | * 创建source/dest context 34 | */ 35 | log.info("初始sparkcontext和kuducontext") 36 | val spark = 
SparkSession.builder().appName(argv.appName).enableHiveSupport().getOrCreate() 37 | spark.sparkContext.getConf.registerKryoClasses(Array(classOf[Args],classOf[eventRow])) 38 | 39 | val ssc = new StreamingContext(spark.sparkContext, Duration(argv.durationTime)) 40 | ssc.checkpoint("/tmp/streamingToIgnite") 41 | 42 | /** 43 | * 初始化igniteContext 44 | */ 45 | /* 46 | log.info("========================================== 初始化ignite ==========================================") 47 | val igniteContext = new IgniteContext(spark.sparkContext, argv.igniteconfxml, true) 48 | val fromCache: IgniteRDD[String, String] = igniteContext.fromCache(argv.cachename) 49 | */ 50 | 51 | /** 52 | * 创建多线程kafka数据流 53 | */ 54 | log.info("初始化kafka数据流") 55 | val kafkaParams = Map[String, Object]( 56 | "bootstrap.servers" -> argv.brokers, 57 | "key.deserializer" -> classOf[StringDeserializer], 58 | "value.deserializer" -> classOf[ByteArrayDeserializer], 59 | "group.id" -> argv.groupid, 60 | "auto.offset.reset" -> "latest", 61 | "session.timeout.ms" -> "30000", 62 | "enable.auto.commit" -> (false: java.lang.Boolean) 63 | ) 64 | val topics = Array(argv.topic) 65 | 66 | val stream = KafkaUtils.createDirectStream[String, Array[Byte]](ssc, PreferConsistent, Subscribe[String, Array[Byte]](topics, kafkaParams)) 67 | 68 | /** 69 | * 开始处理数据 70 | */ 71 | log.info("开始处理数据") 72 | 73 | var offsetRanges = Array[OffsetRange]() 74 | 75 | stream.foreachRDD(rdd => { 76 | 77 | 78 | offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges 79 | 80 | /** 81 | * 仅测试,输出offset, key, value 82 | * 83 | * 如果要存储offset也可以用同样的方法去做 84 | */ 85 | /* 86 | for (record <- rdd) { 87 | System.out.printf("offset = %d, key = %s, value = %s\n", 88 | record.offset(), record.key(), record.value()); 89 | } 90 | */ 91 | 92 | val valueRDD = rdd.map(x=>(x.key(),kryoSerializer.getSerializationObjectByKryo(x.value()))) 93 | 94 | log.info("开始写入ignite") 95 | 96 | import spark.implicits._ 97 | val df = valueRDD.toDF() 98 | 99 | df.write 100 | .format(FORMAT_IGNITE) 101 | .option(OPTION_CONFIG_FILE, argv.igniteconfxml) 102 | .option(OPTION_TABLE, argv.cachename) 103 | .mode(argv.writeMode) 104 | .option(OPTION_STREAMER_ALLOW_OVERWRITE, argv.allowOverwrite) 105 | .option(OPTION_CREATE_TABLE_PRIMARY_KEY_FIELDS, argv.primaryKey) 106 | .option(OPTION_CREATE_TABLE_PARAMETERS, argv.tableParameters) 107 | .save() 108 | 109 | stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges) 110 | 111 | }) 112 | 113 | // TODO 判断流是否为空,如果为空则不提交任务,节省调度时间 114 | ssc.start() 115 | ssc.awaitTermination() 116 | 117 | } 118 | 119 | } 120 | -------------------------------------------------------------------------------- /src/main/scala/objectProject/structuredStreamingKafkaToIgnitePerformance.scala: -------------------------------------------------------------------------------- 1 | package objectProject 2 | 3 | 4 | import com.beust.jcommander.JCommander 5 | import org.apache.log4j.Logger 6 | import org.apache.spark.sql.SparkSession 7 | import common.{Args, igniteWriter} 8 | 9 | class structuredStreamingKafkaToIgnitePerformance { 10 | 11 | } 12 | 13 | object structuredStreamingKafkaToIgnitePerformance { 14 | 15 | private val log = Logger.getLogger(classOf[streamingKafkaToIgnitePerformance]) 16 | 17 | def main(args: Array[String]): Unit = { 18 | 19 | /** 20 | * 获取输入参数与定义全局变量 21 | */ 22 | 23 | log.info("获取输入变量") 24 | val argv = new Args() 25 | JCommander.newBuilder().addObject(argv).build().parse(args: _*) 26 | 27 | /** 28 | * 创建source/dest context 29 | */ 30 | log.info("初始sparkcontext") 31 | 
val spark = SparkSession.builder().appName(argv.appName).enableHiveSupport().getOrCreate() 32 | spark.sparkContext.getConf.registerKryoClasses(Array(classOf[Args])) 33 | 34 | val kafkaParams = Map[String, String]( 35 | "subscribe" -> argv.topic, 36 | "kafka.bootstrap.servers" -> argv.brokers, 37 | "group.id" -> argv.groupid, 38 | "auto.offset.reset" -> "latest", 39 | "session.timeout.ms" -> "30000" 40 | ) 41 | 42 | val records = spark.readStream.format("kafka").options(kafkaParams) 43 | .option("enable.auto.commit", (false: java.lang.Boolean)) 44 | .option("checkpointLocation", "/tmp/structuredStreaming") 45 | .load() 46 | 47 | /** 48 | * 开始处理数据 49 | */ 50 | 51 | val recordsVlues = records.selectExpr("CAST(value AS STRING)") 52 | 53 | val igniteJdbc = "jdbc:ignite:cfg://file://" + argv.igniteconfxml 54 | recordsVlues.writeStream.foreach(new igniteWriter(igniteJdbc)).outputMode("append").start().awaitTermination() 55 | 56 | } 57 | 58 | } 59 | 60 | -------------------------------------------------------------------------------- /src/main/scala/textProject/dataImportKafkaPerformance.scala: -------------------------------------------------------------------------------- 1 | package textProject 2 | 3 | import java.util.Properties 4 | 5 | import com.beust.jcommander.JCommander 6 | import org.apache.kafka.common.serialization.StringSerializer 7 | import org.apache.log4j.Logger 8 | import org.apache.spark.broadcast.Broadcast 9 | import org.apache.spark.sql.SparkSession 10 | import common.{Args, KafkaSink, eventRow} 11 | 12 | class dataImportKafkaPerformance() { 13 | 14 | } 15 | 16 | object dataImportKafkaPerformance { 17 | 18 | private val log = Logger.getLogger(classOf[dataImportKafkaPerformance]) 19 | val sTime: Long = System.currentTimeMillis 20 | 21 | def main(args: Array[String]): Unit = { 22 | 23 | dataImportKafka(args) 24 | 25 | } 26 | 27 | def dataImportKafka(args: Array[String]): Unit = { 28 | //获取传入参数 29 | log.info("========================================== 初始化jcommander ==========================================") 30 | val argv = new Args() 31 | JCommander.newBuilder().addObject(argv).build().parse(args: _*) 32 | 33 | //创建sparksession 34 | val spark = SparkSession 35 | .builder() 36 | .appName(argv.appName) 37 | .enableHiveSupport() 38 | .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 39 | .getOrCreate() 40 | 41 | spark.sparkContext.getConf.registerKryoClasses(Array(classOf[Args],classOf[eventRow])) 42 | 43 | import spark.implicits._ 44 | 45 | /** 46 | * only used for test 47 | */ 48 | log.warn("打印出所有的配置项,供优化参考: \n" + spark.conf.getAll) 49 | 50 | log.info("========================================== 初始化kafka producer ==========================================") 51 | val kafkaProducer: Broadcast[KafkaSink[String, Object]] = { 52 | val kafkaProducerConfig = { 53 | val p = new Properties() 54 | p.setProperty("bootstrap.servers", argv.brokers) 55 | p.setProperty("acks", "all") 56 | p.setProperty("max.in.flight.requests.per.connection", argv.perConnection) 57 | p.setProperty("batch.size", argv.batchSize) 58 | p.setProperty("retries", argv.retries) 59 | p.setProperty("linger.ms", argv.lingerMs) 60 | p.setProperty("buffer.memory", argv.bufferMem) 61 | p.setProperty("compression.type", argv.topicCompression) 62 | p.setProperty("key.serializer", classOf[StringSerializer].getName) 63 | p.setProperty("value.serializer", classOf[StringSerializer].getName) 64 | p 65 | } 66 | log.warn("kafka producer init done!") 67 | spark.sparkContext.broadcast(KafkaSink[String, 
String](kafkaProducerConfig)) 68 | } 69 | 70 | /** 71 | * read data from hive 72 | */ 73 | 74 | val tableDF = spark.table(argv.hiveTableName).select( 75 | "jioyrq", 76 | "jioysj", 77 | "guiyls", 78 | "cpznxh", 79 | "jiaoym", 80 | "jiedbz", 81 | "jio1je", 82 | "kemucc", 83 | "kehuzh", 84 | "kehhao", 85 | "zhyodm", 86 | "hmjsjc", 87 | "huobdh") 88 | 89 | /** 90 | * 一行读取出来,然后判断一行中部分字段是否有业务逻辑问题,如有则记录,发送到error kafka topic中 91 | * 92 | * 输入的是df,然后需要对每一行的每一个字段进行逻辑判断,满足要求则直接取值,不满足要求则改变其值,然后返回一行新的row,最后返回一个新的df 93 | */ 94 | 95 | log.info("========================================== 开始转换df ==========================================") 96 | val filterTableDF = tableDF.map(newRow => 97 | (newRow(0).toString, if ((!(newRow(1).toString).equals(""))) newRow(1).toString else "0", 98 | newRow(2).toString, if (!((newRow(3).toString).equals(""))) newRow(3).toString else "0", newRow(4).toString, 99 | newRow(5).toString, if (!((newRow(6).toString).equals(""))) newRow(6).toString else "0", newRow(7).toString, newRow(8).toString, 100 | newRow(9).toString, newRow(10).toString, newRow(11).toString, newRow(12).toString) 101 | ) 102 | 103 | /** 104 | * 进行二次排序 105 | */ 106 | log.info("========================================== 开始二次排序 ==========================================") 107 | 108 | import org.apache.spark.sql._ 109 | val sortFilterTableDF = filterTableDF.repartition(argv.partitionNum,new Column("_10")).sortWithinPartitions("_12") 110 | 111 | /** 112 | * 然后调用foreatchPartition写入对应的分区,这里是否需要自定义partitioner? 113 | */ 114 | 115 | log.info("========================================== 开始写入kafka ==========================================") 116 | /* 117 | sortFilterTableDF.rdd.mapPartitions(rows => { 118 | log.info("========================================== kafka 1 ==========================================") 119 | rows.map(row => { 120 | val kafkaPartition: Int = row.kehhao.toInt % argv.partitionNum 121 | log.info("kafkaPartition===============" + kafkaPartition) 122 | kafkaProducer.value.send(argv.topic, kafkaPartition ,row.kehhao.toString, row.toString) 123 | }) 124 | }).collect() 125 | */ 126 | 127 | sortFilterTableDF.foreachPartition(rows=>{ 128 | while (rows.hasNext){ 129 | val tmp = rows.next() 130 | var kafkaPartition = 0 131 | try { 132 | kafkaPartition = tmp._10.trim.toInt % argv.partitionNum 133 | }catch{ 134 | case ex: NumberFormatException =>{ 135 | println(ex.getMessage) 136 | log.warn("异常数据:"+tmp.toString()) 137 | } 138 | case ex: Any => { 139 | println("Unkown error!!") 140 | } 141 | } 142 | //log.info("kafkaPartition===============" + kafkaPartition) 143 | kafkaProducer.value.send(argv.topic, kafkaPartition ,tmp._10.toString, tmp.toString()) 144 | } 145 | }) 146 | 147 | kafkaProducer.value.producer.flush() 148 | kafkaProducer.value.producer.close() 149 | 150 | spark.close() 151 | } 152 | 153 | } 154 | -------------------------------------------------------------------------------- /src/main/scala/textProject/streamingKafkaToIgnitePerformance.scala: -------------------------------------------------------------------------------- 1 | package textProject 2 | 3 | import com.beust.jcommander.JCommander 4 | import org.apache.ignite.spark.IgniteDataFrameSettings._ 5 | import org.apache.kafka.common.serialization.StringDeserializer 6 | import org.apache.log4j.Logger 7 | import org.apache.spark.sql.SparkSession 8 | import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe 9 | import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent 10 | import 
org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils, OffsetRange} 11 | import org.apache.spark.streaming.{Duration, StreamingContext} 12 | import common.{Args, eventRow} 13 | 14 | class streamingKafkaToIgnitePerformance { 15 | 16 | } 17 | 18 | object streamingKafkaToIgnitePerformance { 19 | 20 | private val log = Logger.getLogger(classOf[streamingKafkaToIgnitePerformance]) 21 | 22 | def main(args: Array[String]): Unit = { 23 | 24 | /** 25 | * 获取输入参数与定义全局变量 26 | */ 27 | 28 | log.info("获取输入变量") 29 | val argv = new Args() 30 | JCommander.newBuilder().addObject(argv).build().parse(args: _*) 31 | 32 | /** 33 | * 创建source/dest context 34 | */ 35 | log.info("初始sparkcontext和kuducontext") 36 | val spark = SparkSession.builder().appName(argv.appName).enableHiveSupport().getOrCreate() 37 | spark.sparkContext.getConf.registerKryoClasses(Array(classOf[Args],classOf[eventRow])) 38 | 39 | val ssc = new StreamingContext(spark.sparkContext, Duration(argv.durationTime)) 40 | ssc.checkpoint("/tmp/streamingToIgnite") 41 | 42 | /** 43 | * 初始化igniteContext 44 | */ 45 | /* 46 | log.info("========================================== 初始化ignite ==========================================") 47 | val igniteContext = new IgniteContext(spark.sparkContext, argv.igniteconfxml, true) 48 | val fromCache: IgniteRDD[String, String] = igniteContext.fromCache(argv.cachename) 49 | */ 50 | 51 | /** 52 | * 创建多线程kafka数据流 53 | */ 54 | log.info("初始化kafka数据流") 55 | val kafkaParams = Map[String, Object]( 56 | "bootstrap.servers" -> argv.brokers, 57 | "key.deserializer" -> classOf[StringDeserializer], 58 | "value.deserializer" -> classOf[StringDeserializer], 59 | "group.id" -> argv.groupid, 60 | "auto.offset.reset" -> "latest", 61 | "session.timeout.ms" -> "30000", 62 | "enable.auto.commit" -> (false: java.lang.Boolean) 63 | ) 64 | val topics = Array(argv.topic) 65 | 66 | val stream = KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams)) 67 | 68 | /** 69 | * 开始处理数据 70 | */ 71 | log.info("开始处理数据") 72 | 73 | var offsetRanges = Array[OffsetRange]() 74 | 75 | stream.foreachRDD(rdd => { 76 | 77 | 78 | offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges 79 | 80 | /** 81 | * 仅测试,输出offset, key, value 82 | * 83 | * 如果要存储offset也可以用同样的方法去做 84 | */ 85 | /* 86 | for (record <- rdd) { 87 | System.out.printf("offset = %d, key = %s, value = %s\n", 88 | record.offset(), record.key(), record.value()); 89 | } 90 | */ 91 | 92 | val valueRDD = rdd.map(_.value().split(",")) 93 | 94 | log.info("开始写入ignite") 95 | 96 | import spark.implicits._ 97 | val df = valueRDD.map(x => eventRow(x(0).replace("(", ""), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9), 98 | x(10), x(11), x(12).replace(")", ""))).toDF() 99 | 100 | df.write 101 | .format(FORMAT_IGNITE) 102 | .option(OPTION_CONFIG_FILE, argv.igniteconfxml) 103 | .option(OPTION_TABLE, argv.cachename) 104 | .mode(argv.writeMode) 105 | .option(OPTION_STREAMER_ALLOW_OVERWRITE, argv.allowOverwrite) 106 | .option(OPTION_CREATE_TABLE_PRIMARY_KEY_FIELDS, argv.primaryKey) 107 | .option(OPTION_CREATE_TABLE_PARAMETERS, argv.tableParameters) 108 | .save() 109 | 110 | stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges) 111 | 112 | }) 113 | 114 | // TODO 判断流是否为空,如果为空则不提交任务,节省调度时间 115 | ssc.start() 116 | ssc.awaitTermination() 117 | 118 | } 119 | 120 | } 121 | -------------------------------------------------------------------------------- 
/src/main/scala/textProject/structuredStreamingKafkaToIgnitePerformance.scala: --------------------------------------------------------------------------------
1 | package textProject
2 | 
3 | 
4 | import com.beust.jcommander.JCommander
5 | import org.apache.log4j.Logger
6 | import org.apache.spark.sql.SparkSession
7 | import common.{Args, igniteWriter}
8 | 
9 | class structuredStreamingKafkaToIgnitePerformance {
10 | 
11 | }
12 | 
13 | object structuredStreamingKafkaToIgnitePerformance {
14 | 
15 |   private val log = Logger.getLogger(classOf[structuredStreamingKafkaToIgnitePerformance])
16 | 
17 |   def main(args: Array[String]): Unit = {
18 | 
19 |     /**
20 |       * Parse the input arguments and define global variables
21 |       */
22 | 
23 |     log.info("Parsing input arguments")
24 |     val argv = new Args()
25 |     JCommander.newBuilder().addObject(argv).build().parse(args: _*)
26 | 
27 |     /**
28 |       * Create the source/dest contexts
29 |       */
30 |     log.info("Initializing the SparkSession")
31 |     val spark = SparkSession.builder().appName(argv.appName).enableHiveSupport().getOrCreate()
32 |     spark.sparkContext.getConf.registerKryoClasses(Array(classOf[Args]))
33 | 
34 |     val kafkaParams = Map[String, String](
35 |       "subscribe" -> argv.topic,
36 |       "kafka.bootstrap.servers" -> argv.brokers,
37 |       "group.id" -> argv.groupid,
38 |       "auto.offset.reset" -> "latest",
39 |       "session.timeout.ms" -> "30000"
40 |     )
41 | 
42 |     val records = spark.readStream.format("kafka").options(kafkaParams)
43 |       .option("enable.auto.commit", (false: java.lang.Boolean))
44 |       .load() // checkpointLocation is a query (write-side) option, so it is set on writeStream below
45 | 
46 | 
47 |     /**
48 |       * Start processing the data
49 |       */
50 | 
51 |     val recordsValues = records.selectExpr("CAST(value AS STRING)")
52 | 
53 |     val igniteJdbc = "jdbc:ignite:cfg://file://" + argv.igniteconfxml
54 |     recordsValues.writeStream.foreach(new igniteWriter(igniteJdbc)).outputMode("append").option("checkpointLocation", "/tmp/structuredStreaming").start().awaitTermination()
55 | 
56 |   }
57 | 
58 | }
59 | 
60 | 
--------------------------------------------------------------------------------