├── README.md ├── src └── main │ ├── scala │ └── com │ │ └── venn │ │ ├── stream │ │ └── api │ │ │ ├── broadcast │ │ │ ├── readmd.md │ │ │ └── BroadCastDemo.scala │ │ │ ├── intervalJoin │ │ │ ├── IntervalUser.scala │ │ │ ├── IntervalJoinProcessFunctionDemo.scala │ │ │ ├── IntervalJoinKafkaKeyMaker.scala │ │ │ └── IntervalJoinDemo.scala │ │ │ ├── sideoutput │ │ │ └── lateDataProcess │ │ │ │ └── readme.md │ │ │ ├── dayWindow │ │ │ ├── CurrentDayMaker.scala │ │ │ └── CurrentDayPvCount.scala │ │ │ ├── trigger │ │ │ └── ProcessWindowForTrigger.scala │ │ │ ├── checkpoint │ │ │ └── CheckpointDebug.scala │ │ │ ├── tableJoin │ │ │ └── CacheFile.scala │ │ │ └── timer │ │ │ └── CustomerTimerDemo.scala │ │ ├── connector │ │ ├── jdbcOutput │ │ │ ├── User.scala │ │ │ ├── MysqlOutputMaker.scala │ │ │ ├── MysqlOutputDemo.scala │ │ │ ├── MysqlSink1.scala │ │ │ └── MysqlSink.scala │ │ ├── filesink │ │ │ ├── filesink.md │ │ │ ├── DayBasePathBucketer.scala │ │ │ ├── FileSinkMaker.scala │ │ │ ├── DayBucketAssigner.scala │ │ │ ├── DayBulkWriter.scala │ │ │ ├── StreamingFileSinkDemo.scala │ │ │ └── RollingFileSinkDemo.scala │ │ ├── starrocks │ │ │ ├── Column.java │ │ │ ├── TableSchema.java │ │ │ ├── CustJdbcSource.java │ │ │ ├── StreamLoadTestV2.scala │ │ │ └── StreamLoadTest.scala │ │ ├── cdc │ │ │ └── CdcDdlTest.scala │ │ ├── pulsar │ │ │ └── PulsarDemo.scala │ │ └── kafka │ │ │ └── KafkaSinkTest.scala │ │ ├── question │ │ ├── retention │ │ │ └── UserLog.scala │ │ ├── stock │ │ │ ├── entry │ │ │ │ ├── Stock.java │ │ │ │ ├── OverStockDetail.java │ │ │ │ ├── OverStock.java │ │ │ │ ├── StockList.java │ │ │ │ └── StockListDetail.java │ │ │ ├── util │ │ │ │ ├── OverStockFlatMapFunction.scala │ │ │ │ └── StockCommon.scala │ │ │ └── README.md │ │ ├── processAndEvent │ │ │ └── SimpleProcessFunction.scala │ │ ├── dynamicWindow │ │ │ ├── DataSourceFunction.scala │ │ │ ├── DyTumblingWindow.java │ │ │ ├── DyProcessWindowFunction.scala │ │ │ └── readme.md │ │ ├── cdcStarrocks │ │ │ ├── CdcRecord.java │ │ │ ├── CdcStarMapFunction.java │ │ │ ├── CdcToStarRocks.java │ │ │ └── CdcStarProcessFunction.java │ │ ├── UserClue │ │ │ ├── UserClue.scala │ │ │ └── question.md │ │ ├── late1mtps │ │ │ ├── LateTpsProcessWindowFunction.scala │ │ │ └── LateTps.scala │ │ ├── dataFluctuation │ │ │ └── DataFluctuation.scala │ │ └── tryFlink │ │ │ └── FraudDetection.scala │ │ ├── table │ │ └── TableApiDemo.java │ │ ├── util │ │ ├── TwoStringSource.scala │ │ ├── MathUtil.java │ │ ├── StringUtil.java │ │ ├── CheckpointUtil.scala │ │ └── HttpClientUtil.java │ │ ├── demo │ │ ├── CustomerSource.scala │ │ ├── SlotPartitionDemo.scala │ │ ├── FilterTest.scala │ │ ├── SlotPartitionMaker.scala │ │ └── relationCntA.scala │ │ ├── source │ │ ├── cust │ │ │ ├── ReadHttpWordCount.java │ │ │ ├── CustHttpSource.java │ │ │ └── HttpServer.java │ │ ├── kafka │ │ │ ├── kafkaToKafkaGroup.sql │ │ │ ├── KafkaUpsertTableSink.java │ │ │ └── KafkaUpsertTableSourceSinkFactory.java │ │ ├── RichAsyncFunction.scala │ │ └── mysql │ │ │ └── cdc │ │ │ ├── CommonKafkaSink.java │ │ │ ├── Binlog.java │ │ │ └── MySqlBinlogSourceExample.java │ │ ├── cep │ │ ├── cep.md │ │ ├── ContinueRising.scala │ │ └── AfterMatchStrategyDemo.scala │ │ └── common │ │ ├── Common.java │ │ └── MySqlDateTimeConverter.java │ ├── resources │ ├── data │ │ └── tablejoin.txt │ ├── cdc_demo.properties │ ├── sql │ │ ├── kafkaJsonSourceSinkDemo.sql │ │ └── sqlDemo.sql │ └── log4j.properties │ ├── java │ └── com │ │ └── venn │ │ ├── entity │ │ ├── EntityObject.java │ │ ├── Behavior.java │ │ ├── 
StreamElement.java │ │ ├── KafkaSimpleStringRecord.java │ │ └── UserLog.java │ │ ├── question │ │ └── LateTps.java │ │ ├── flink │ │ └── asyncio │ │ │ ├── MysqlData.java │ │ │ ├── AsyncUser.java │ │ │ ├── AsyncHbaseRequest.java │ │ │ ├── AsyncFunctionForMysqlJava.java │ │ │ ├── AsyncMysqlRequest.java │ │ │ ├── AsyncFunctionForHbaseJava.java │ │ │ └── MysqlClient.java │ │ ├── util │ │ └── SimpleKafkaRecordDeserializationSchema.java │ │ └── demo │ │ ├── KafkaJoinRedisDemo.java │ │ ├── AsyncRedisFunction.java │ │ └── TypeTest.java │ └── test │ └── com │ └── venn │ └── connector │ └── kafka │ └── KafkaOffsetRevertTest.scala ├── doc ├── Flink Table Api & SQL.pdf └── Flink Table Api & SQL.docx ├── .gitignore └── git.sh /README.md: -------------------------------------------------------------------------------- 1 | # flink-rookie 2 | Flink 菜鸟公众号代码地址 3 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/stream/api/broadcast/readmd.md: -------------------------------------------------------------------------------- 1 | ## 读取广播变量 -------------------------------------------------------------------------------- /src/main/resources/data/tablejoin.txt: -------------------------------------------------------------------------------- 1 | 1,venn 2 | 2,mary 3 | 3,tom 4 | 4,join -------------------------------------------------------------------------------- /doc/Flink Table Api & SQL.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/springMoon/flink-rookie/HEAD/doc/Flink Table Api & SQL.pdf -------------------------------------------------------------------------------- /doc/Flink Table Api & SQL.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/springMoon/flink-rookie/HEAD/doc/Flink Table Api & SQL.docx -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | target/ 3 | /.idea 4 | .idea/ 5 | .idea/workspace.xml 6 | .idea/compiler.xml 7 | .idea/misc.xml 8 | *.iml -------------------------------------------------------------------------------- /src/main/java/com/venn/entity/EntityObject.java: -------------------------------------------------------------------------------- 1 | package com.venn.entity; 2 | 3 | public abstract class EntityObject { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/connector/jdbcOutput/User.scala: -------------------------------------------------------------------------------- 1 | package com.venn.connector.jdbcOutput 2 | 3 | case class User(username: String, password: String, sex: Int, phone: String) -------------------------------------------------------------------------------- /src/main/scala/com/venn/stream/api/intervalJoin/IntervalUser.scala: -------------------------------------------------------------------------------- 1 | package com.venn.stream.api.intervalJoin 2 | 3 | case class IntervalUser(id: String, name: String, phone:String, date: String) 4 | -------------------------------------------------------------------------------- /git.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | message="update today" 4 | if [ -n "$1" ]; then 5 | message=$1 6 | fi 7 | 8 | git pull 9 | git add * 10 | git commit -m "$message ` date 
-d now +"%F %T"`"
git push
--------------------------------------------------------------------------------
/src/main/scala/com/venn/question/retention/UserLog.scala:
--------------------------------------------------------------------------------
package com.venn.question.retention

case class UserLog(userId: String, categoryId: Int, itemId: Int, behavior: String, ts: String, tsLong: Long)
--------------------------------------------------------------------------------
/src/main/scala/com/venn/connector/filesink/filesink.md:
--------------------------------------------------------------------------------
## file sink for user-defined file names
* BucketingSink
* StreamingFileSink

```txt
With BucketingSink, override BasePathBucketer.
With StreamingFileSink, implement a custom BucketAssigner.
```
--------------------------------------------------------------------------------
/src/main/scala/com/venn/table/TableApiDemo.java:
--------------------------------------------------------------------------------
package com.venn.table;

public class TableApiDemo {
    public static void main(String[] args) {
        // TableEnvironment tableEnv = TableEnvironment.create(/*…*/);
    }
}
--------------------------------------------------------------------------------
/src/main/scala/com/venn/question/stock/entry/Stock.java:
--------------------------------------------------------------------------------
package com.venn.question.stock.entry;

/**
 * @Classname Stock
 * @Description TODO
 * @Date 2023/6/12
 * @Created by venn
 */
public interface Stock {

}
--------------------------------------------------------------------------------
/src/main/scala/com/venn/stream/api/sideoutput/lateDataProcess/readme.md:
--------------------------------------------------------------------------------
## Late-data handling and the number of keyBy keys
Question from 木三: for event-time data that arrives late, Flink receives it via a side output — how is it collected internally? Could someone who has read the source code explain?

Plan: study this part of the source code.

Steps:
* 1. Write a Flink job based on event time
* 2. Add a side output for late data
* 3. Debug the relevant source code to trace how late data is processed and where that logic lives
--------------------------------------------------------------------------------
/src/main/java/com/venn/question/LateTps.java:
--------------------------------------------------------------------------------
package com.venn.question;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class LateTps {

    public static void main(String[] args) {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment();
        env.setParallelism(1);

    }
}
--------------------------------------------------------------------------------
/src/main/scala/com/venn/question/stock/util/OverStockFlatMapFunction.scala:
--------------------------------------------------------------------------------
package com.venn.question.stock.util

import org.apache.flink.api.common.functions.FlatMapFunction
import org.apache.flink.util.Collector

/**
 * @Classname OverStockFlatMapFunction
 * @Description TODO
 * @Date 2023/6/13
 * @Created by venn
 */
class OverStockFlatMapFunction extends FlatMapFunction[String, String] {
  override def flatMap(t: String, collector: Collector[String]): Unit = {

  }
}
--------------------------------------------------------------------------------
/src/main/scala/com/venn/util/TwoStringSource.scala:
-------------------------------------------------------------------------------- 1 | package com.venn.util 2 | 3 | import org.apache.flink.streaming.api.functions.source.SourceFunction 4 | 5 | class TwoStringSource extends SourceFunction[String] { 6 | 7 | var flag = true 8 | 9 | override def cancel(): Unit = { 10 | 11 | flag = false 12 | } 13 | 14 | override def run(ctx: SourceFunction.SourceContext[String]): Unit = { 15 | 16 | while (flag) { 17 | val str = MathUtil.getRadomNum(1) 18 | ctx.collect(str + "," + StringUtil.getRandomString(1).toUpperCase) 19 | Thread.sleep(1000) 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/demo/CustomerSource.scala: -------------------------------------------------------------------------------- 1 | package com.venn.demo 2 | 3 | import org.apache.flink.streaming.api.functions.source.SourceFunction 4 | 5 | 6 | class CustomerSource extends SourceFunction[Tuple2[Long,Long]]{ 7 | 8 | var count=1625048255867L 9 | var isRunning=true 10 | override def run(ctx: SourceFunction.SourceContext[Tuple2[Long,Long]]): Unit = { 11 | while(isRunning) { 12 | ctx.collect(new Tuple2(count,count)) 13 | count += 1000 14 | Thread.sleep(1000) 15 | } 16 | } 17 | 18 | override def cancel(): Unit = { 19 | 20 | isRunning=false 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/resources/cdc_demo.properties: -------------------------------------------------------------------------------- 1 | job_name=guest_task_result 2 | ## source mysql 3 | source.host:localhost 4 | source.port:3306 5 | source.user:root 6 | source.pass:123456 7 | source.database:operation 8 | source.table_list:operation.guest_task_result 9 | source.time_zone:Asia/Shanghai 10 | # init latest 11 | source.startup_option:latest 12 | source.startup_option_time:2024-03-07 00:00:00 13 | ## sink starrocks 14 | sink.jdbc-url=jdbc:mysql://localhost:9030 15 | sink.load-url=localhost:18030 16 | sink.jdbcPort = 9030 17 | sink.httpPort=18030 18 | sink.username=root 19 | sink.password=123456 20 | sink.database-name=test 21 | sink.table-name=guest_task_result 22 | sink.batch=64000 23 | sink.interval=5000 -------------------------------------------------------------------------------- /src/main/scala/com/venn/question/stock/util/StockCommon.scala: -------------------------------------------------------------------------------- 1 | package com.venn.question.stock.util 2 | 3 | /** 4 | * @Classname MoveSaleCommon 5 | * @Description TODO 6 | * @Date 2023/6/8 7 | * @Created by venn 8 | */ 9 | object StockCommon { 10 | 11 | val MYSQL_HOST = "10.201.0.30" 12 | val MYSQL_PORT = 3316 13 | val MYSQL_USER = "root" 14 | val MYSQL_PASS = "R59JUZJ&dG" 15 | 16 | val KAFKA_BOOTSTRAT_SERVER = "localhost:9092" 17 | 18 | val OVERSTOCK = "ods_poc_k3_sal_outstock" 19 | val OVERSTOCK_DETAIL = "ods_poc_k3_sal_outstockentry" 20 | val STOCK_LIST = "ods_poc_sfa_distributoroutstocklist" 21 | val STOCK_LIST_DETAIL = "ods_poc_sfa_distributoroutstocklist_detail" 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/question/stock/README.md: -------------------------------------------------------------------------------- 1 | # 库存场景计算 2 | 3 | ## 源表 4 | 5 | | 表名 | 内容 | 6 | | --- | --- | 7 | | ods_poc_dim_product_doc_api | dim_产品档案 | 8 | | ods_poc_dim_agency_doc_api | dim_经销商档案 | 9 | | ods_poc_dim_dpt_doc_api | dim_部门档案 | 10 | | ods_poc_k3_customer | k3_客户档案 | 
11 | | ods_poc_k3_material | k3_物料档案 | 12 | | ods_poc_k3_sal_outstock | k3_销售出库单主表 | 13 | | ods_poc_k3_sal_outstockentry | k3_销售出库单子表 | 14 | | ods_poc_sfa_in_out | sfa_出入库类型 | 15 | | ods_poc_sfa_stocking | sfa_期初库存 | 16 | | ods_poc_sfa_distributoroutstocklist | sfa_经销商进出库扫码主表 | 17 | | ods_poc_sfa_distributoroutstocklist_detail | sfa_经销商进出库扫码子表 | 18 | | ods_poc_sfa_distributor_department | sfa_经销商部门对照关系 | 19 | 20 | 21 | 22 | ## 动销 23 | 业务逻辑: 非核心产品采购 + 核心产品销售 C002/C004 24 | 25 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/source/cust/ReadHttpWordCount.java: -------------------------------------------------------------------------------- 1 | package com.venn.source.cust; 2 | 3 | import org.apache.flink.streaming.api.datastream.DataStreamSource; 4 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 5 | 6 | public class ReadHttpWordCount { 7 | public static void main(String[] args) throws Exception { 8 | 9 | StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 10 | env.setParallelism(1); 11 | 12 | DataStreamSource source = env.addSource(new CustHttpSource("http://localhost:8888", 10)); 13 | 14 | source.map(item -> item) 15 | .keyBy(item -> "0") 16 | .max(0) 17 | .print(); 18 | env.execute("ReadHttpWordCount"); 19 | 20 | 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/connector/filesink/DayBasePathBucketer.scala: -------------------------------------------------------------------------------- 1 | //package com.venn.connector.filesink 2 | // 3 | //import java.io.File 4 | //import org.apache.flink.streaming.connectors.fs.Clock 5 | //import org.apache.flink.streaming.connectors.fs.bucketing.BasePathBucketer 6 | //import org.apache.hadoop.fs.Path 7 | // 8 | // 9 | ///** 10 | // * 根据实际数据返回数据输出的路径 11 | // */ 12 | //class DayBasePathBucketer extends BasePathBucketer[String]{ 13 | // 14 | // /** 15 | // * 返回路径 16 | // * @param clock 17 | // * @param basePath 18 | // * @param element 19 | // * @return 20 | // */ 21 | // override def getBucketPath(clock: Clock, basePath: Path, element: String): Path = { 22 | // // yyyyMMdd 23 | // val day = element.substring(1, 9) 24 | // new Path(basePath + File.separator + day) 25 | // } 26 | //} 27 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/util/MathUtil.java: -------------------------------------------------------------------------------- 1 | package com.venn.util; 2 | 3 | import java.util.Random; 4 | 5 | /** 6 | * Created by venn on 19-2-13. 
7 | */ 8 | public class MathUtil { 9 | 10 | public static Random random = new Random(); 11 | public static int index =1; 12 | 13 | public static String getMediaCode(int i){ 14 | String mediacode = fitNum(i); 15 | 16 | return mediacode; 17 | } 18 | 19 | private static String fitNum(int num){ 20 | String str = String.valueOf(num); 21 | 22 | while (str.length() < 10){ 23 | str = "0"+str; 24 | } 25 | return str; 26 | } 27 | 28 | public static String getRadomNum(int num){ 29 | String tmp = ""; 30 | for (int i =0; i< num; i++){ 31 | tmp += random.nextInt(10); 32 | } 33 | 34 | return tmp; 35 | 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/question/processAndEvent/SimpleProcessFunction.scala: -------------------------------------------------------------------------------- 1 | package com.venn.question.processAndEvent 2 | 3 | import com.venn.question.retention.UserLog 4 | import com.venn.util.DateTimeUtil 5 | import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction 6 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow 7 | import org.apache.flink.util.Collector 8 | 9 | /** 10 | * user day retention analyze process function 11 | */ 12 | class SimpleProcessFunction(time: String) extends ProcessWindowFunction[UserLog, String, String, TimeWindow] { 13 | override def process(key: String, context: Context, elements: Iterable[UserLog], out: Collector[String]): Unit = { 14 | 15 | val current = DateTimeUtil.formatMillis(System.currentTimeMillis(), DateTimeUtil.YYYY_MM_DD_HH_MM_SS) 16 | out.collect(current + "\t time trigger calc: " + time) 17 | 18 | } 19 | } -------------------------------------------------------------------------------- /src/main/scala/com/venn/demo/SlotPartitionDemo.scala: -------------------------------------------------------------------------------- 1 | package com.venn.demo 2 | 3 | import com.venn.common.Common 4 | import org.apache.flink.api.common.serialization.SimpleStringSchema 5 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 6 | import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaConsumer, FlinkKafkaProducer} 7 | import org.apache.flink.api.scala._ 8 | 9 | object SlotPartitionDemo { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment 14 | val topic = "slot_partition" 15 | val source = new FlinkKafkaConsumer[String](topic, new SimpleStringSchema(), Common.getProp) 16 | val sink = new FlinkKafkaProducer[String](topic+"_out", new SimpleStringSchema(), Common.getProp) 17 | 18 | env.setParallelism(2) 19 | env.addSource(source) 20 | .addSink(sink) 21 | 22 | 23 | env.execute(this.getClass.getName) 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/question/dynamicWindow/DataSourceFunction.scala: -------------------------------------------------------------------------------- 1 | package com.venn.question.dynamicWindow 2 | 3 | import java.util 4 | 5 | import com.google.gson.Gson 6 | import org.apache.flink.streaming.api.functions.source.SourceFunction 7 | 8 | import scala.util.Random 9 | 10 | /** 11 | * data source 12 | */ 13 | class DataSourceFunction extends SourceFunction[String] { 14 | 15 | var flag = true 16 | 17 | override def run(ctx: SourceFunction.SourceContext[String]): Unit = { 18 | 19 | var map = new util.HashMap[String, String] 20 | while (flag) { 21 | 22 
      val random = new Random()
      val gson = new Gson()
      for (i <- 1 to 4) {

        map.put("attr", "attr" + i)
        map.put("value", "" + random.nextInt(1000))
        map.put("time", "" + System.currentTimeMillis())

        val json = gson.toJson(map)

        ctx.collect(json)
      }

      Thread.sleep(1000)

    }

  }

  override def cancel(): Unit = {
    flag = false

  }
}
--------------------------------------------------------------------------------
/src/main/scala/com/venn/cep/cep.md:
--------------------------------------------------------------------------------
# CEP Demo
```text
this package is for CEP demos
```

## References
```text
Official docs: https://ci.apache.org/projects/flink/flink-docs-release-1.9/dev/libs/cep.html
Translation of the official docs: https://www.cnblogs.com/Springmoon-venn/p/11993468.html
刘博, Flink CEP in practice:
PPT: https://files.alicdn.com/tpsservice/94d409d9679d1b46034f7d00161d99a7.pdf
Video: https://www.bilibili.com/video/av66073054/
刘博, Apache Flink CEP in practice: https://mp.weixin.qq.com/s/4dQYr-RXKBRdrhu6Y5dZdw
Flink-CEPplus project: https://github.com/ljygz/Flink-CEPplus (the author and 末日布孤单 appear to be the same person)
CEP source-code walkthrough by 末日布孤单: https://www.cnblogs.com/ljygz/p/11978386.html
```

## After-match skip strategies:
```text
pattern: b+ c
input  : b1 b2 b3 c
NO_SKIP              : b1 b2 b3 c / b2 b3 c / b3 c  # skip only one event at a time, then start matching again
SKIP_TO_NEXT         : b1 b2 b3 c / b2 b3 c / b3 c  # skip to the next start event (i.e. the next b)
SKIP_PAST_LAST_EVENT : b1 b2 b3 c                   # skip past all events that were already matched
SKIP_TO_FIRST[b]     : b1 b2 b3 c / b2 b3 c / b3 c  # skip to the first b (if the first event is already b, start from the first b after it)?
SKIP_TO_LAST[b]      : b1 b2 b3 c / b3 c            # skip to the last b; if the pattern has no consecutive b, this probably skips to the event after c?
```
--------------------------------------------------------------------------------
/src/main/java/com/venn/entity/Behavior.java:
--------------------------------------------------------------------------------
package com.venn.entity;

/**
 * Clickstream entity object
 */
public class Behavior {
    private String userId;
    private String url;
    private long ts;

    public Behavior(String userId, String url, long ts) {
        this.userId = userId;
        this.url = url;
        this.ts = ts;
    }

    public String getUserId() {
        return userId;
    }

    public void setUserId(String userId) {
        this.userId = userId;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public long getTs() {
        return ts;
    }

    public void setTs(long ts) {
        this.ts = ts;
    }

    @Override
    public String toString() {
        return "Behavior{" +
                "userId='" + userId + '\'' +
                ", url='" + url + '\'' +
                ", ts=" + ts +
                '}';
    }
}
--------------------------------------------------------------------------------
/src/main/scala/com/venn/util/StringUtil.java:
--------------------------------------------------------------------------------
package com.venn.util;

/**
 * Created by venn on 19-2-13.
5 | */ 6 | public class StringUtil { 7 | 8 | public static String getRandomString(int len){ 9 | StringBuilder sb = new StringBuilder(); 10 | char tmp; 11 | for(int i=0; i< len; i++){ 12 | if(MathUtil.random.nextBoolean()){ 13 | tmp = (char)(MathUtil.random.nextInt(26) + 65); 14 | }else{ 15 | tmp = (char)(MathUtil.random.nextInt(26) + 97); 16 | } 17 | sb.append(tmp); 18 | } 19 | return sb.toString(); 20 | } 21 | 22 | public static String getRandomString(){ 23 | StringBuilder sb = new StringBuilder(); 24 | char tmp; 25 | for(int i=0; i<= 10; i++){ 26 | if(MathUtil.random.nextBoolean()){ 27 | tmp = (char)(MathUtil.random.nextInt(26) + 65); 28 | }else{ 29 | tmp = (char)(MathUtil.random.nextInt(26) + 97); 30 | } 31 | sb.append(tmp); 32 | } 33 | return sb.toString(); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/stream/api/intervalJoin/IntervalJoinProcessFunctionDemo.scala: -------------------------------------------------------------------------------- 1 | package com.venn.stream.api.intervalJoin 2 | 3 | import org.apache.flink.configuration.Configuration 4 | import org.apache.flink.streaming.api.functions.co.ProcessJoinFunction 5 | import org.apache.flink.util.Collector 6 | 7 | /** 8 | * 9 | */ 10 | class IntervalJoinProcessFunctionDemo extends ProcessJoinFunction[IntervalUser, IntervalUser, IntervalUser] { 11 | 12 | override def open(parameters: Configuration): Unit = { 13 | 14 | } 15 | 16 | 17 | override def processElement(left: IntervalUser, 18 | right: IntervalUser, 19 | ctx: ProcessJoinFunction[IntervalUser, IntervalUser, IntervalUser]#Context, 20 | out: Collector[IntervalUser]): Unit = { 21 | 22 | // println("left timestamp : " + ctx.getLeftTimestamp) 23 | // println("right timestamp : " + ctx.getRightTimestamp) 24 | 25 | out.collect(IntervalUser(left.id , left.name, right.phone, (left.date + "-" + right.date))) 26 | 27 | } 28 | 29 | 30 | 31 | override def close(): Unit = { 32 | 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/main/java/com/venn/flink/asyncio/MysqlData.java: -------------------------------------------------------------------------------- 1 | package com.venn.flink.asyncio; 2 | 3 | import java.sql.DriverManager; 4 | import java.sql.PreparedStatement; 5 | import java.sql.SQLException; 6 | 7 | public class MysqlData { 8 | 9 | private static String jdbcUrl = "jdbc:mysql://192.168.229.128:3306?useSSL=false&allowPublicKeyRetrieval=true"; 10 | private static String username = "root"; 11 | private static String password = "123456"; 12 | private static String driverName = "com.mysql.jdbc.Driver"; 13 | 14 | 15 | public static void main(String[] args) throws ClassNotFoundException, SQLException { 16 | 17 | java.sql.Connection conn; 18 | PreparedStatement ps; 19 | 20 | Class.forName(driverName); 21 | conn = DriverManager.getConnection(jdbcUrl, username, password); 22 | ps = conn.prepareStatement("insert into async.async_test(id, phone) values (?, ?)"); 23 | 24 | for (int i = 100000; i < 1000000; i++){ 25 | ps.setString(1, "" + i); 26 | ps.setString(2, "" + System.currentTimeMillis()); 27 | 28 | ps.execute(); 29 | // conn.commit(); 30 | } 31 | 32 | 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/connector/jdbcOutput/MysqlOutputMaker.scala: -------------------------------------------------------------------------------- 1 | package com.venn.connector.jdbcOutput 2 | 3 | 
import java.text.SimpleDateFormat 4 | 5 | import com.venn.common.Common 6 | import com.venn.util.{MathUtil, StringUtil} 7 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 8 | 9 | /** 10 | * test data maker 11 | */ 12 | 13 | object MysqlOutputMaker { 14 | val topic = "async" 15 | 16 | def main(args: Array[String]): Unit = { 17 | 18 | while (true) { 19 | 20 | left("mysql_output") 21 | Thread.sleep(100) 22 | } 23 | } 24 | 25 | val sdf = new SimpleDateFormat("yyyyMMddHHmmss") 26 | 27 | var id = 0 28 | 29 | def left(topic: String) = { 30 | val producer = new KafkaProducer[String, String](Common.getProp) 31 | id = id + 1 32 | val username = StringUtil.getRandomString(5) 33 | val password = StringUtil.getRandomString(10) 34 | val sex = MathUtil.random.nextInt(2) 35 | val phone = MathUtil.getRadomNum(11) 36 | 37 | val message = username + "," + password + "," + sex + "," + phone 38 | 39 | val msg = new ProducerRecord[String, String](topic, message) 40 | producer.send(msg) 41 | producer.flush() 42 | println("send : " + message) 43 | } 44 | 45 | 46 | } 47 | 48 | -------------------------------------------------------------------------------- /src/main/resources/sql/kafkaJsonSourceSinkDemo.sql: -------------------------------------------------------------------------------- 1 | --sourceTable 2 | CREATE TABLE user_log( 3 | user_id VARCHAR, 4 | item_id VARCHAR, 5 | category_id VARCHAR, 6 | behavior VARCHAR, 7 | ts TIMESTAMP(3) 8 | ) WITH ( 9 | 'connector.type' = 'kafka', 10 | 'connector.version' = 'universal', 11 | 'connector.topic' = 'user_behavior', 12 | 'connector.properties.zookeeper.connect' = 'venn:2181', 13 | 'connector.properties.bootstrap.servers' = 'venn:9092', 14 | 'connector.startup-mode' = 'earliest-offset', 15 | 'format.type' = 'json' 16 | # 'format.type' = 'csv' 17 | ); 18 | 19 | --sinkTable 20 | CREATE TABLE user_log_sink ( 21 | user_id VARCHAR, 22 | item_id VARCHAR, 23 | category_id VARCHAR, 24 | behavior VARCHAR, 25 | ts TIMESTAMP(3) 26 | ) WITH ( 27 | 'connector.type' = 'kafka', 28 | 'connector.version' = 'universal', 29 | 'connector.topic' = 'user_behavior_sink', 30 | 'connector.properties.zookeeper.connect' = 'venn:2181', 31 | 'connector.properties.bootstrap.servers' = 'venn:9092', 32 | 'update-mode' = 'append', 33 | # 'format.type' = 'json' 34 | 'format.type' = 'csv' 35 | ); 36 | 37 | --insert 38 | INSERT INTO user_log_sink 39 | SELECT user_id, item_id, category_id, behavior, ts 40 | FROM user_log; 41 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | ################################################################################ 18 | 19 | log4j.rootLogger=info, console 20 | 21 | log4j.appender.console=org.apache.log4j.ConsoleAppender 22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.console.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} %-5p %-60c %x - %m%n 24 | 25 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/connector/starrocks/Column.java: -------------------------------------------------------------------------------- 1 | package com.venn.connector.starrocks; 2 | 3 | /** 4 | * @Classname Column 5 | * @Description TODO 6 | * @Date 2024/3/8 7 | * @Created by venn 8 | */ 9 | public class Column { 10 | 11 | private String name; 12 | private String type; 13 | private String comment; 14 | 15 | public Column() { 16 | } 17 | 18 | public Column(String name, String type, String comment) { 19 | this.name = name; 20 | this.type = type; 21 | this.comment = comment; 22 | } 23 | 24 | public String getName() { 25 | return name; 26 | } 27 | 28 | public void setName(String name) { 29 | this.name = name; 30 | } 31 | 32 | public String getType() { 33 | return type; 34 | } 35 | 36 | public void setType(String type) { 37 | this.type = type; 38 | } 39 | 40 | public String getComment() { 41 | return comment; 42 | } 43 | 44 | public void setComment(String comment) { 45 | this.comment = comment; 46 | } 47 | 48 | @Override 49 | public String toString() { 50 | return "Column{" + 51 | "name='" + name + '\'' + 52 | ", type='" + type + '\'' + 53 | ", comment='" + comment + '\'' + 54 | '}'; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/resources/sql/sqlDemo.sql: -------------------------------------------------------------------------------- 1 | --sourceTable 2 | CREATE TABLE user_log ( 3 | user_id VARCHAR, 4 | item_id VARCHAR, 5 | category_id VARCHAR, 6 | behavior VARCHAR, 7 | ts TIMESTAMP(3) 8 | ) WITH ( 9 | 'connector.type' = 'kafka', 10 | 'connector.version' = 'universal', 11 | 'connector.topic' = 'user_behavior', 12 | 'connector.startup-mode' = 'earliest-offset', 13 | 'connector.properties.0.key' = 'zookeeper.connect', 14 | 'connector.properties.0.value' = 'venn:2181', 15 | 'connector.properties.1.key' = 'bootstrap.servers', 16 | 'connector.properties.1.value' = 'venn:9092', 17 | 'update-mode' = 'append', 18 | 'format.type' = 'json', 19 | 'format.derive-schema' = 'true' 20 | ); 21 | 22 | --sinkTable 23 | CREATE TABLE pvuv_sink ( 24 | dt VARCHAR, 25 | pv BIGINT, 26 | uv BIGINT 27 | ) WITH ( 28 | 'connector.type' = 'jdbc', 29 | 'connector.url' = 'jdbc:mysql://venn:3306/venn', 30 | 'connector.table' = 'pvuv_sink', 31 | 'connector.username' = 'root', 32 | 'connector.password' = '123456', 33 | 'connector.write.flush.max-rows' = '1' 34 | ); 35 | 36 | --insert 37 | INSERT INTO pvuv_sink(dt, pv, uv) 38 | SELECT 39 | DATE_FORMAT(ts, 'yyyy-MM-dd HH:00') dt, 40 | COUNT(*) AS pv, 41 | COUNT(DISTINCT user_id) AS uv 42 | FROM user_log 43 | GROUP BY DATE_FORMAT(ts, 'yyyy-MM-dd HH:00'); 44 | -------------------------------------------------------------------------------- /src/main/test/com/venn/connector/kafka/KafkaOffsetRevertTest.scala: -------------------------------------------------------------------------------- 1 | package com.venn.kafka 2 | 3 | import java.text.SimpleDateFormat 4 | 
import java.util.{Calendar, Date} 5 | 6 | import com.venn.common.Common 7 | import com.venn.util.MathUtil 8 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 9 | 10 | import scala.util.parsing.json.JSONObject 11 | 12 | /** 13 | * test data maker 14 | */ 15 | 16 | object CurrentDayMaker { 17 | 18 | 19 | /** 20 | * kafka offset revert test 21 | * kafka offset 回退测试 22 | * 23 | * @return 24 | */ 25 | val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS") 26 | 27 | def main(args: Array[String]): Unit = { 28 | val producer = new KafkaProducer[String, String](Common.getProp()) 29 | var i = 0; 30 | while (true) { 31 | 32 | // val map = Map("id"-> i, "createTime"-> sdf.format(System.currentTimeMillis())) 33 | val map = Map("id" -> i, "createTime" -> sdf.format(System.currentTimeMillis()), "amt" -> (MathUtil.random.nextInt(10) + "." + MathUtil.random.nextInt(10))) 34 | val jsonObject: JSONObject = new JSONObject(map) 35 | println(jsonObject.toString()) 36 | // topic current_day 37 | val msg = new ProducerRecord[String, String]("kafka_offset", jsonObject.toString()) 38 | producer.send(msg) 39 | producer.flush() 40 | Thread.sleep(1000) 41 | i = i + 1 42 | // System.exit(-1) 43 | } 44 | } 45 | 46 | } 47 | 48 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/connector/filesink/FileSinkMaker.scala: -------------------------------------------------------------------------------- 1 | //package com.venn.connector.filesink 2 | // 3 | //import java.text.SimpleDateFormat 4 | //import java.util.Calendar 5 | //import com.venn.common.Common 6 | //import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 7 | // 8 | // 9 | ///** 10 | // * test data maker 11 | // */ 12 | // 13 | //object FileSinkMaker { 14 | // val topic = "async" 15 | // 16 | // def main(args: Array[String]): Unit = { 17 | // 18 | // while (true) { 19 | // 20 | // left("roll_file_sink") 21 | // Thread.sleep(100) 22 | // } 23 | // } 24 | // 25 | // val sdf = new SimpleDateFormat("yyyyMMddHHmmss") 26 | // 27 | // var idLeft = 0 28 | // 29 | // def left(topic: String) = { 30 | // val producer = new KafkaProducer[String, String](Common.getProp) 31 | // idLeft = idLeft + 1 32 | // val map = Map("id" -> idLeft, "name" -> ("venn" + System.currentTimeMillis()), "date" -> getCreateTime) 33 | // val jsonObject: JSONObject = new JSONObject(map) 34 | // println("left : " + jsonObject.toString()) 35 | // val msg = new ProducerRecord[String, String](topic, jsonObject.toString()) 36 | //// producer.send(msg) 37 | //// producer.flush() 38 | // } 39 | // 40 | // var minute : Int = 1 41 | // val calendar: Calendar = Calendar.getInstance() 42 | // def getCreateTime(): String = { 43 | // // minute = minute + 1 44 | // calendar.add(Calendar.MINUTE, 10) 45 | // sdf.format(calendar.getTime) 46 | // } 47 | // 48 | //} 49 | // 50 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/source/cust/CustHttpSource.java: -------------------------------------------------------------------------------- 1 | package com.venn.source.cust; 2 | 3 | import com.venn.util.HttpClientUtil; 4 | import org.apache.flink.configuration.Configuration; 5 | import org.apache.flink.metrics.Counter; 6 | import org.apache.flink.metrics.SimpleCounter; 7 | import org.apache.flink.streaming.api.functions.source.RichSourceFunction; 8 | 9 | public class CustHttpSource extends RichSourceFunction { 10 | 11 | private String url; 12 | private long 
requestInterval; 13 | private boolean flag = false; 14 | private transient Counter counter; 15 | 16 | public CustHttpSource(String url, long requestInterval) { 17 | this.url = url; 18 | this.requestInterval = requestInterval; 19 | } 20 | 21 | @Override 22 | public void open(Configuration parameters) throws Exception { 23 | flag = true; 24 | 25 | counter = new SimpleCounter(); 26 | this.counter = getRuntimeContext() 27 | .getMetricGroup() 28 | .counter("myCounter"); 29 | 30 | } 31 | 32 | @Override 33 | public void run(SourceContext ctx) throws Exception { 34 | 35 | 36 | while (true) { 37 | String result = HttpClientUtil.doGet(url); 38 | 39 | ctx.collect(result); 40 | this.counter.inc(); 41 | 42 | Thread.sleep(requestInterval); 43 | } 44 | 45 | } 46 | 47 | @Override 48 | public void cancel() { 49 | flag = false; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/com/venn/flink/asyncio/AsyncUser.java: -------------------------------------------------------------------------------- 1 | package com.venn.flink.asyncio; 2 | 3 | public class AsyncUser { 4 | 5 | private String id; 6 | private String username; 7 | private String password; 8 | private String phone; 9 | 10 | public AsyncUser() { 11 | } 12 | 13 | public AsyncUser(String id, String username, String password) { 14 | this.id = id; 15 | this.username = username; 16 | this.password = password; 17 | } 18 | 19 | public String getPhone() { 20 | return phone; 21 | } 22 | 23 | public void setPhone(String phone) { 24 | this.phone = phone; 25 | } 26 | 27 | public String getId() { 28 | return id; 29 | } 30 | 31 | public void setId(String id) { 32 | this.id = id; 33 | } 34 | 35 | public String getUsername() { 36 | return username; 37 | } 38 | 39 | public void setUsername(String username) { 40 | this.username = username; 41 | } 42 | 43 | public String getPassword() { 44 | return password; 45 | } 46 | 47 | public void setPassword(String password) { 48 | this.password = password; 49 | } 50 | 51 | @Override 52 | public String toString() { 53 | return "AsyncUser{" + 54 | "id='" + id + '\'' + 55 | ", username='" + username + '\'' + 56 | ", password='" + password + '\'' + 57 | ", phone='" + phone + '\'' + 58 | '}'; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/question/cdcStarrocks/CdcRecord.java: -------------------------------------------------------------------------------- 1 | package com.venn.question.cdcStarrocks; 2 | 3 | import java.util.LinkedHashMap; 4 | import java.util.Map; 5 | 6 | /** 7 | * cdcRecord save 8 | */ 9 | public class CdcRecord { 10 | 11 | private String db; 12 | private String table; 13 | private String op; 14 | private Map data = new LinkedHashMap<>(); 15 | 16 | public CdcRecord(String db, String table, String op) { 17 | this.db = db; 18 | this.table = table; 19 | this.op = op; 20 | } 21 | 22 | public String getDb() { 23 | return db; 24 | } 25 | 26 | public void setDb(String db) { 27 | this.db = db; 28 | } 29 | 30 | public String getTable() { 31 | return table; 32 | } 33 | 34 | public void setTable(String table) { 35 | this.table = table; 36 | } 37 | 38 | public String getOp() { 39 | return op; 40 | } 41 | 42 | public void setOp(String op) { 43 | this.op = op; 44 | } 45 | 46 | public Map getData() { 47 | return data; 48 | } 49 | 50 | public void setData(Map data) { 51 | this.data = data; 52 | } 53 | 54 | @Override 55 | public String toString() { 56 | return "CdcRecord{" + 57 | "db='" + db + 
'\'' + 58 | ", table='" + table + '\'' + 59 | ", op='" + op + '\'' + 60 | ", data=" + data + 61 | '}'; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/demo/FilterTest.scala: -------------------------------------------------------------------------------- 1 | package com.venn.demo 2 | 3 | import org.apache.flink.api.common.eventtime.WatermarkStrategy 4 | import org.apache.flink.api.common.serialization.SimpleStringSchema 5 | import org.apache.flink.connector.kafka.source.KafkaSource 6 | import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer 7 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 8 | import org.apache.flink.api.scala._ 9 | 10 | import java.util.regex.Pattern 11 | import scala.util.Random 12 | 13 | object FilterTest { 14 | 15 | 16 | def main(args: Array[String]): Unit = { 17 | 18 | 19 | val env = StreamExecutionEnvironment.getExecutionEnvironment 20 | env.setParallelism(1) 21 | val topic = "filter_test" 22 | 23 | val random = new Random(); 24 | print(random.nextString(16)) 25 | 26 | val bootstrapServer = "localhost:9092" 27 | val kafkaSource = KafkaSource.builder[String]() 28 | .setBootstrapServers(bootstrapServer) 29 | .setTopicPattern(Pattern.compile(topic)) 30 | .setGroupId("day_window") 31 | // .setStartingOffsets(OffsetsInitializer.committedOffsets()) 32 | .setStartingOffsets(OffsetsInitializer.latest()) 33 | .setValueOnlyDeserializer(new SimpleStringSchema()) 34 | .build() 35 | 36 | 37 | env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "kafkaSoruce1") 38 | .filter(str => str.equals("abc")) 39 | .print(">>") 40 | 41 | env.execute("exec") 42 | 43 | 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/com/venn/entity/StreamElement.java: -------------------------------------------------------------------------------- 1 | package com.venn.entity; 2 | 3 | /** 4 | * 流实例基类 5 | */ 6 | public final class StreamElement { 7 | 8 | public String source; 9 | public long ingestionTime; 10 | public String db; 11 | public String table; 12 | public T data; 13 | 14 | public StreamElement(T data, long ingestionTime) { 15 | this.data = data; 16 | this.ingestionTime = ingestionTime; 17 | } 18 | 19 | public String getSource() { 20 | return source; 21 | } 22 | 23 | public void setSource(String source) { 24 | this.source = source; 25 | } 26 | 27 | public Long getIngestionTime() { 28 | return ingestionTime; 29 | } 30 | 31 | public String getDb() { 32 | return db; 33 | } 34 | 35 | public void setDb(String db) { 36 | this.db = db; 37 | } 38 | 39 | public String getTable() { 40 | return table; 41 | } 42 | 43 | public void setTable(String table) { 44 | this.table = table; 45 | } 46 | 47 | public T getData() { 48 | return data; 49 | } 50 | 51 | public void setData(T data) { 52 | this.data = data; 53 | } 54 | 55 | @Override 56 | public String toString() { 57 | return "StreamElement{" + 58 | "source='" + source + '\'' + 59 | ", ingestionTime=" + ingestionTime + 60 | ", db='" + db + '\'' + 61 | ", table='" + table + '\'' + 62 | ", data=" + data + 63 | '}'; 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/connector/cdc/CdcDdlTest.scala: -------------------------------------------------------------------------------- 1 | package com.venn.connector.cdc 2 | 3 | import 
com.venn.source.mysql.cdc.CommonStringDebeziumDeserializationSchema 4 | import com.ververica.cdc.connectors.mysql.source.MySqlSource 5 | import com.ververica.cdc.connectors.mysql.table.StartupOptions 6 | import org.apache.flink.api.common.eventtime.WatermarkStrategy 7 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 8 | import org.apache.flink.api.scala._ 9 | 10 | import java.util.TimeZone 11 | 12 | /** 13 | * @Classname CdcDdlTest 14 | * @Description TODO 15 | * @Date 2023/8/31 16 | * @Created by venn 17 | */ 18 | object CdcDdlTest { 19 | 20 | def main(args: Array[String]): Unit = { 21 | 22 | val env = StreamExecutionEnvironment.getExecutionEnvironment 23 | 24 | // cdc source 25 | val source = MySqlSource.builder[String]() 26 | .hostname("rm-2ze0qoq964s4nnodi.mysql.rds.aliyuncs.com") 27 | .port(3306) 28 | .username("daas") 29 | .password("Dass@2021") 30 | .databaseList("dct3_0") 31 | .tableList("dct3_0.*") 32 | .serverTimeZone("Asia/Shanghai") 33 | // 包含 schema change 34 | .includeSchemaChanges(true) 35 | .startupOptions(StartupOptions.latest()) 36 | .deserializer(new DdlDebeziumDeserializationSchema("", 3306)) 37 | .build() 38 | 39 | 40 | 41 | env.setParallelism(1) 42 | env.fromSource(source, WatermarkStrategy.noWatermarks[String](), "cdc") 43 | .map((str: String) => str) 44 | .print() 45 | 46 | 47 | env.execute("CdcDdlTest") 48 | 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/connector/starrocks/TableSchema.java: -------------------------------------------------------------------------------- 1 | package com.venn.connector.starrocks; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * @Classname TableSchema 8 | * @Description TODO 9 | * @Date 2024/3/8 10 | * @Created by venn 11 | */ 12 | public class TableSchema { 13 | 14 | private String tableName; 15 | private String tableComment; 16 | private List column = new ArrayList<>(); 17 | 18 | public TableSchema() { 19 | } 20 | 21 | public TableSchema(String tableName, String tableComment, List column) { 22 | this.tableName = tableName; 23 | this.tableComment = tableComment; 24 | this.column = column; 25 | } 26 | 27 | public String getTableName() { 28 | return tableName; 29 | } 30 | 31 | public void setTableName(String tableName) { 32 | this.tableName = tableName; 33 | } 34 | 35 | public String getTableComment() { 36 | return tableComment; 37 | } 38 | 39 | public void setTableComment(String tableComment) { 40 | this.tableComment = tableComment; 41 | } 42 | 43 | public List getColumn() { 44 | return column; 45 | } 46 | 47 | public void setColumn(List column) { 48 | this.column = column; 49 | } 50 | 51 | @Override 52 | public String toString() { 53 | return "TableSchema{" + 54 | "tableName='" + tableName + '\'' + 55 | ", tableComment='" + tableComment + '\'' + 56 | ", column=" + column + 57 | '}'; 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/source/cust/HttpServer.java: -------------------------------------------------------------------------------- 1 | package com.venn.source.cust; 2 | 3 | import com.sun.net.httpserver.HttpExchange; 4 | import com.sun.net.httpserver.HttpHandler; 5 | import org.apache.commons.io.IOUtils; 6 | 7 | import java.io.IOException; 8 | import java.io.OutputStream; 9 | import java.net.InetSocketAddress; 10 | import java.util.UUID; 11 | 12 | /** 13 | * 创建 http server 监控端口请求 14 | */ 15 | public class HttpServer 
{ 16 | 17 | public static void main(String[] arg) throws Exception { 18 | 19 | com.sun.net.httpserver.HttpServer server = com.sun.net.httpserver.HttpServer.create(new InetSocketAddress(8888), 10); 20 | server.createContext("/", new TestHandler()); 21 | server.start(); 22 | } 23 | 24 | static class TestHandler implements HttpHandler { 25 | public void handle(HttpExchange exchange) throws IOException { 26 | String response = "hello world"; 27 | 28 | try { 29 | //获得表单提交数据(post) 30 | String postString = IOUtils.toString(exchange.getRequestBody()); 31 | 32 | exchange.sendResponseHeaders(200, 0); 33 | OutputStream os = exchange.getResponseBody(); 34 | String result = UUID.randomUUID().toString(); 35 | result = System.currentTimeMillis() + ",name," + result; 36 | os.write(result.getBytes()); 37 | os.close(); 38 | } catch (IOException ie) { 39 | ie.printStackTrace(); 40 | } catch (Exception e) { 41 | e.printStackTrace(); 42 | } 43 | } 44 | } 45 | 46 | } 47 | 48 | 49 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/source/kafka/kafkaToKafkaGroup.sql: -------------------------------------------------------------------------------- 1 | -- 读 json,写csv 2 | ---sourceTable 3 | CREATE TABLE user_log( 4 | user_id VARCHAR, 5 | item_id VARCHAR, 6 | category_id VARCHAR, 7 | behavior VARCHAR, 8 | ts TIMESTAMP(3), 9 | proctime as PROCTIME() 10 | ) WITH ( 11 | 'connector.type' = 'kafka', 12 | 'connector.version' = 'universal', 13 | 'connector.topic' = 'user_behavior', 14 | 'connector.properties.zookeeper.connect' = 'venn:2181', 15 | 'connector.properties.bootstrap.servers' = 'venn:9092', 16 | 'connector.startup-mode' = 'earliest-offset', 17 | 'update-mode' = 'upsert', 18 | 'format.type' = 'json' 19 | ); 20 | 21 | ---sinkTable 22 | CREATE TABLE user_log_sink ( 23 | item_id VARCHAR , 24 | category_id VARCHAR , 25 | behavior VARCHAR , 26 | max_tx TIMESTAMP(3), 27 | min_prc TIMESTAMP(3), 28 | max_prc TIMESTAMP(3), 29 | coun BIGINT 30 | ) WITH ( 31 | 'connector.type' = 'myKafka', 32 | 'connector.version' = 'universal', 33 | 'connector.topic' = 'user_behavior_sink', 34 | 'connector.properties.zookeeper.connect' = 'venn:2181', 35 | 'connector.properties.bootstrap.servers' = 'venn:9092', 36 | 'update-mode' = 'upsert', 37 | 'format.type' = 'json' 38 | ); 39 | 40 | ---insert 41 | INSERT INTO user_log_sink 42 | SELECT item_id, category_id, behavior, max(ts), min(proctime), max(proctime), count(user_id) 43 | FROM user_log 44 | group by item_id, category_id, behavior; 45 | -- SELECT item_id, category_id, behavior, max(ts), min(proctime), max(proctime), count(user_id) 46 | -- from user_log 47 | -- group by TUMBLE(proctime, INTERVAL '1' MINUTE ), item_id,category_id,behavior; -------------------------------------------------------------------------------- /src/main/scala/com/venn/connector/filesink/DayBucketAssigner.scala: -------------------------------------------------------------------------------- 1 | package com.venn.connector.filesink 2 | 3 | import java.io.IOException 4 | import java.nio.charset.StandardCharsets 5 | import org.apache.flink.core.io.SimpleVersionedSerializer 6 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.ObjectNode 7 | import org.apache.flink.streaming.api.functions.sink.filesystem.BucketAssigner 8 | 9 | class DayBucketAssigner extends BucketAssigner[ObjectNode, String] { 10 | 11 | /** 12 | * bucketId is the output path 13 | * @param element 14 | * @param context 15 | * @return 16 | */ 17 | override def 
getBucketId(element: ObjectNode, context: BucketAssigner.Context): String = { 18 | //context.currentProcessingTime() 19 | val day = element.get("date").asText("19790101000000").substring(0, 8) 20 | // wrap can day + "/" + xxx 21 | day 22 | } 23 | 24 | override def getSerializer: SimpleVersionedSerializer[String] = { 25 | 26 | StringSerializer 27 | } 28 | 29 | /** 30 | * 实现参考 : org.apache.flink.runtime.checkpoint.StringSerializer 31 | */ 32 | object StringSerializer extends SimpleVersionedSerializer[String] { 33 | val VERSION = 77 34 | 35 | override def getVersion = 77 36 | 37 | @throws[IOException] 38 | override def serialize(checkpointData: String): Array[Byte] = checkpointData.getBytes(StandardCharsets.UTF_8) 39 | 40 | @throws[IOException] 41 | override def deserialize(version: Int, serialized: Array[Byte]): String = if (version != 77) throw new IOException("version mismatch") 42 | else new String(serialized, StandardCharsets.UTF_8) 43 | } 44 | 45 | 46 | } 47 | 48 | 49 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/stream/api/dayWindow/CurrentDayMaker.scala: -------------------------------------------------------------------------------- 1 | package com.venn.stream.api.dayWindow 2 | 3 | import com.google.gson.GsonBuilder 4 | 5 | import java.text.SimpleDateFormat 6 | import java.util.{Calendar, Date} 7 | import com.venn.common.Common 8 | import com.venn.util.MathUtil 9 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 10 | 11 | 12 | /** 13 | * test data maker 14 | */ 15 | 16 | object CurrentDayMaker { 17 | 18 | 19 | var minute : Int = 1 20 | val calendar: Calendar = Calendar.getInstance() 21 | 22 | /** 23 | * 一天时间比较长,不方便观察,将时间改为当前时间, 24 | * 每次累加10分钟,这样一天只需要144次循环,也就是144秒 25 | * @return 26 | */ 27 | def getCreateTime(): String = { 28 | // minute = minute + 1 29 | calendar.add(Calendar.MINUTE, 10) 30 | sdf.format(calendar.getTime) 31 | } 32 | val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS") 33 | 34 | def main(args: Array[String]): Unit = { 35 | val producer = new KafkaProducer[String, String](Common.getProp) 36 | calendar.setTime(new Date()) 37 | println(sdf.format(calendar.getTime)) 38 | var i =0; 39 | while (true) { 40 | 41 | // val map = Map("id"-> i, "createTime"-> sdf.format(System.currentTimeMillis())) 42 | val map = Map("id"-> i, "createTime"-> getCreateTime(), "amt"-> (MathUtil.random.nextInt(10) +"." 
+ MathUtil.random.nextInt(10)))
      val gson = new GsonBuilder().create()
      val json = gson.toJson(map)
      // topic current_day
      val msg = new ProducerRecord[String, String]("current_day", json)
      producer.send(msg)
      producer.flush()
      Thread.sleep(1000)
      i = i + 1
      // System.exit(-1)
    }
  }

}
--------------------------------------------------------------------------------
/src/main/scala/com/venn/connector/filesink/DayBulkWriter.scala:
--------------------------------------------------------------------------------
package com.venn.connector.filesink

import java.io.File
import java.nio.charset.StandardCharsets
import org.apache.flink.api.common.serialization.BulkWriter
import org.apache.flink.core.fs.FSDataOutputStream
import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.ObjectNode
import org.apache.flink.util.Preconditions

/**
 * Implementation reference: org.apache.flink.streaming.api.functions.sink.filesystem.BulkWriterTest
 */
class DayBulkWriter extends BulkWriter[ObjectNode] {

  val charset = StandardCharsets.UTF_8
  var stream: FSDataOutputStream = _

  def DayBulkWriter(inputStream: FSDataOutputStream): DayBulkWriter = {
    stream = Preconditions.checkNotNull(inputStream)
    this
  }

  /**
   * write element
   *
   * @param element
   */
  override def addElement(element: ObjectNode): Unit = {
    this.stream.write(element.toString.getBytes(charset))
    // wrap
    this.stream.write('\n')

  }

  override def flush(): Unit = {
    this.stream.flush()
  }

  /**
   * the output stream is an input parameter, so just flush; closing it is the factory's job
   */
  override def finish(): Unit = {
    this.flush()
  }

}

/**
 * Implementation reference: org.apache.flink.streaming.api.functions.sink.filesystem.BulkWriterTest.TestBulkWriterFactory
 */
class DayBulkWriterFactory extends BulkWriter.Factory[ObjectNode] {
  override def create(out: FSDataOutputStream): BulkWriter[ObjectNode] = {
    val dayBulkWriter = new DayBulkWriter
    dayBulkWriter.DayBulkWriter(out)

  }
}
--------------------------------------------------------------------------------
/src/main/java/com/venn/flink/asyncio/AsyncHbaseRequest.java:
--------------------------------------------------------------------------------
package com.venn.flink.asyncio;

import com.google.gson.Gson;
import com.venn.common.Common;
import org.apache.flink.formats.json.JsonNodeDeserializationSchema;
import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.ObjectNode;
import org.apache.flink.streaming.api.datastream.AsyncDataStream;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;

import java.util.concurrent.TimeUnit;


public class AsyncHbaseRequest {

    public static void main(String[] args) throws Exception {

        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        FlinkKafkaConsumer<ObjectNode> source = new FlinkKafkaConsumer<>("async", new JsonNodeDeserializationSchema(), Common.getProp());

        // receive Kafka data and convert it to AsyncUser objects
        DataStream<AsyncUser> input = env.addSource(source).map(value -> {
            String id = value.get("id").asText();
String username = value.get("username").asText(); 26 | String password = value.get("password").asText(); 27 | 28 | return new AsyncUser(id, username, password); 29 | }); 30 | // 异步IO 获取hbase, timeout 时间 1s,容量 100(超过100个请求,会反压上游节点) 31 | DataStream async = AsyncDataStream.unorderedWait(input, new AsyncFunctionForHbaseJava(), 1000, TimeUnit.MICROSECONDS, 100); 32 | 33 | async.map(user -> { 34 | 35 | return new Gson().toJson(user).toString(); 36 | }) 37 | .print(); 38 | 39 | env.execute("asyncForHbase"); 40 | 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/connector/jdbcOutput/MysqlOutputDemo.scala: -------------------------------------------------------------------------------- 1 | package com.venn.connector.jdbcOutput 2 | 3 | import java.io.File 4 | 5 | import com.venn.common.Common 6 | import org.apache.flink.api.common.serialization.SimpleStringSchema 7 | import org.apache.flink.api.scala._ 8 | import org.apache.flink.runtime.state.filesystem.FsStateBackend 9 | import org.apache.flink.streaming.api.functions.ProcessFunction 10 | import org.apache.flink.streaming.api.scala.{OutputTag, StreamExecutionEnvironment} 11 | import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic} 12 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer 13 | import org.apache.flink.util.Collector 14 | 15 | /** 16 | * 侧边输出:This operation can be useful when you want to split a stream of data 17 | */ 18 | object MysqlOutputDemo { 19 | 20 | def main(args: Array[String]): Unit = { 21 | val env = StreamExecutionEnvironment.getExecutionEnvironment 22 | env.setParallelism(1) 23 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) 24 | if ("/".equals(File.separator)) { 25 | val backend = new FsStateBackend(Common.CHECK_POINT_DATA_DIR, true) 26 | env.setStateBackend(backend) 27 | env.enableCheckpointing(10 * 1000, CheckpointingMode.EXACTLY_ONCE) 28 | } else { 29 | env.setMaxParallelism(1) 30 | env.setParallelism(1) 31 | } 32 | 33 | val source = new FlinkKafkaConsumer[String]("mysql_output", new SimpleStringSchema, Common.getProp) 34 | source.setStartFromLatest() 35 | env.addSource(source) 36 | .map(li => { 37 | val tmp = li.split(",") 38 | new User(tmp(0), tmp(1), tmp(2) toInt, tmp(3)) 39 | }) 40 | // .addSink(new MysqlSink1) 41 | .writeUsingOutputFormat(new MysqlSink1) 42 | 43 | env.execute("msqlOutput") 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/stream/api/intervalJoin/IntervalJoinKafkaKeyMaker.scala: -------------------------------------------------------------------------------- 1 | package com.venn.stream.api.intervalJoin 2 | 3 | import com.google.gson.{Gson, GsonBuilder, JsonObject} 4 | 5 | import java.text.SimpleDateFormat 6 | import com.venn.common.Common 7 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 8 | 9 | 10 | /** 11 | * test data maker 12 | */ 13 | 14 | object IntervalJoinKafkaKeyMaker { 15 | val topic = "async" 16 | 17 | def main(args: Array[String]): Unit = { 18 | 19 | while (true) { 20 | 21 | left("topic_left") 22 | right("topic_right") 23 | Thread.sleep(500) 24 | } 25 | } 26 | 27 | val sdf = new SimpleDateFormat("yyyyMMddHHmmss") 28 | 29 | var idLeft = 0 30 | 31 | def left(topic: String) = { 32 | val producer = new KafkaProducer[String, String](Common.getProp) 33 | idLeft = idLeft + 1 34 | val map = Map("id" -> idLeft, "name" -> ("venn" + System.currentTimeMillis()), 
"date" -> sdf.format(System.currentTimeMillis())) 35 | val gson = new GsonBuilder().create(); 36 | gson.toJson(map); 37 | 38 | println("left : " + gson.toString()) 39 | val msg = new ProducerRecord[String, String](topic, gson.toString()) 40 | producer.send(msg) 41 | producer.flush() 42 | } 43 | 44 | var idRight = 0 45 | 46 | def right(topic: String) = { 47 | val producer = new KafkaProducer[String, String](Common.getProp) 48 | idRight = idRight + 1 49 | val map = Map("id" -> idRight, "phone" -> ("17713333333" + idRight), "date" -> sdf.format(System.currentTimeMillis())) 50 | val gson = new GsonBuilder().create(); 51 | gson.toJson(map); 52 | println("right : \t\t\t\t\t\t\t\t" + gson.toString()) 53 | val msg = new ProducerRecord[String, String](topic, gson.toString()) 54 | producer.send(msg) 55 | producer.flush() 56 | } 57 | 58 | } 59 | 60 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/connector/jdbcOutput/MysqlSink1.scala: -------------------------------------------------------------------------------- 1 | package com.venn.connector.jdbcOutput 2 | 3 | import java.sql.{Connection, DriverManager, PreparedStatement, SQLException} 4 | import org.apache.flink.api.common.io.OutputFormat 5 | import org.apache.flink.configuration.Configuration 6 | import org.slf4j.{Logger, LoggerFactory} 7 | 8 | class MysqlSink1 extends OutputFormat[User]{ 9 | 10 | val logger: Logger = LoggerFactory.getLogger("MysqlSink1") 11 | var conn: Connection = _ 12 | var ps: PreparedStatement = _ 13 | val jdbcUrl = "jdbc:mysql://192.168.229.128:3306?useSSL=false&allowPublicKeyRetrieval=true" 14 | val username = "root" 15 | val password = "123456" 16 | val driverName = "com.mysql.jdbc.Driver" 17 | 18 | override def configure(parameters: Configuration): Unit = { 19 | // not need 20 | } 21 | 22 | override def open(taskNumber: Int, numTasks: Int): Unit = { 23 | Class.forName(driverName) 24 | try { 25 | Class.forName(driverName) 26 | conn = DriverManager.getConnection(jdbcUrl, username, password) 27 | 28 | // close auto commit 29 | conn.setAutoCommit(false) 30 | } catch { 31 | case e@(_: ClassNotFoundException | _: SQLException) => 32 | logger.error("init mysql error") 33 | e.printStackTrace() 34 | System.exit(-1); 35 | } 36 | } 37 | 38 | override def writeRecord(user: User): Unit = { 39 | 40 | println("get user : " + user.toString) 41 | ps = conn.prepareStatement("insert into async.user(username, password, sex, phone) values(?,?,?,?)") 42 | ps.setString(1, user.username) 43 | ps.setString(2, user.password) 44 | ps.setInt(3, user.sex) 45 | ps.setString(4, user.phone) 46 | 47 | ps.execute() 48 | conn.commit() 49 | } 50 | 51 | override def close(): Unit = { 52 | 53 | if (conn != null){ 54 | conn.commit() 55 | conn.close() 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/common/Common.java: -------------------------------------------------------------------------------- 1 | package com.venn.common; 2 | 3 | 4 | import java.util.Properties; 5 | 6 | /** 7 | * Created by venn on 19-3-5. 
8 | */ 9 | public class Common { 10 | 11 | public final static String BROKER_LIST = "venn:9092"; 12 | public final static String ZOOKEEPER_QUORUM = "venn"; 13 | public final static String ZOOKEEPER_PORT = "2180"; 14 | public final static String ZOOKEEPER_ZNODE_PARENT = "venn:9092"; 15 | public final static String PULSAR_SERVER = "pulsar://localhost:6650"; 16 | public final static String PULSAR_ADMIN = "http://localhost:8080"; 17 | public final static String CHECK_POINT_DATA_DIR = "hdfs:///home/wuxu/tmp/checkpoint"; 18 | // public final static String CHECK_POINT_DATA_DIR = "file:///out/checkpoint"; 19 | // public final static String CHECK_POINT_DATA_DIR = "hdfs:///venn/checkpoint"; 20 | 21 | public static Properties prop = null; 22 | 23 | public static Properties getProp(String brokerList) { 24 | 25 | if (prop == null) { 26 | prop = new Properties(); 27 | prop.put("bootstrap.servers", brokerList); 28 | prop.put("request.required.acks", "-1"); 29 | prop.put("auto.offset.reset", "latest"); 30 | prop.put("key.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer"); 31 | prop.put("value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer"); 32 | prop.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer"); 33 | prop.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer"); 34 | prop.put("group.id", "venn"); 35 | prop.put("client.id", "venn"); 36 | } 37 | return prop; 38 | } 39 | 40 | public static Properties getProp() { 41 | 42 | return getProp(BROKER_LIST); 43 | } 44 | 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/com/venn/entity/KafkaSimpleStringRecord.java: -------------------------------------------------------------------------------- 1 | package com.venn.entity; 2 | 3 | import org.apache.kafka.common.TopicPartition; 4 | 5 | import java.io.Serializable; 6 | 7 | /** 8 | * generic string kafka record, ues by @MyKafkaRecordDeserializationSchema 9 | */ 10 | public class KafkaSimpleStringRecord implements Serializable { 11 | private static final long serialVersionUID = 4813439951036021779L; 12 | // kafka topic partition 13 | private final TopicPartition tp; 14 | // record kafka offset 15 | private final long offset; 16 | // record key 17 | private final String key; 18 | // record timestamp 19 | private final long timestamp; 20 | // record value 21 | private final String value; 22 | 23 | 24 | public KafkaSimpleStringRecord(TopicPartition tp, long offset, String key, long timestamp, String value) { 25 | this.tp = tp; 26 | this.offset = offset; 27 | this.key = key; 28 | this.timestamp = timestamp; 29 | this.value = value; 30 | } 31 | 32 | public static long getSerialVersionUID() { 33 | return serialVersionUID; 34 | } 35 | 36 | public TopicPartition getTp() { 37 | return tp; 38 | } 39 | 40 | public long getOffset() { 41 | return offset; 42 | } 43 | 44 | public String getKey() { 45 | return key; 46 | } 47 | 48 | public long getTimestamp() { 49 | return timestamp; 50 | } 51 | 52 | public String getValue() { 53 | return value; 54 | } 55 | 56 | @Override 57 | public String toString() { 58 | return "MyStringKafkaRecord{" + 59 | "tp=topic:" + tp.topic() + ", partition: " + tp.partition() + 60 | ", offset=" + offset + 61 | ", key='" + key + '\'' + 62 | ", timestamp=" + timestamp + 63 | ", value='" + value + '\'' + 64 | '}'; 65 | } 66 | } -------------------------------------------------------------------------------- 
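A minimal usage sketch for the record type above (not part of the repository): it wires KafkaSimpleStringRecord to the SimpleKafkaRecordDeserializationSchema defined later in this repo and keeps only the payload, mirroring com.venn.demo.KafkaJoinRedisDemo; the broker address and topic name below are placeholders.

import com.venn.entity.KafkaSimpleStringRecord;
import com.venn.util.SimpleKafkaRecordDeserializationSchema;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class KafkaSimpleStringRecordUsageSketch {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // deserialize each Kafka record into the wrapper entity shown above
        KafkaSource<KafkaSimpleStringRecord> kafkaSource = KafkaSource
                .<KafkaSimpleStringRecord>builder()
                .setBootstrapServers("localhost:9092")            // placeholder broker
                .setTopics("user_log")                            // placeholder topic
                .setDeserializer(new SimpleKafkaRecordDeserializationSchema())
                .setStartingOffsets(OffsetsInitializer.latest())
                .build();

        env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "kafkaSource")
                // drop the Kafka metadata and keep only the record value
                .map((MapFunction<KafkaSimpleStringRecord, String>) value -> value.getValue())
                .print();

        env.execute("kafkaSimpleStringRecordUsage");
    }
}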
/src/main/scala/com/venn/source/RichAsyncFunction.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.venn.source 20 | 21 | import org.apache.flink.api.common.functions.AbstractRichFunction 22 | import org.apache.flink.streaming.api.scala.async.AsyncFunction 23 | 24 | /** 25 | * Rich variant of [[AsyncFunction]]. As a [[org.apache.flink.api.common.functions.RichFunction]], 26 | * it gives access to the [[org.apache.flink.api.common.functions.RuntimeContext]] and provides 27 | * setup and teardown methods. 28 | * 29 | * State related apis in [[org.apache.flink.api.common.functions.RuntimeContext]] are not supported 30 | * yet because the key may get changed while accessing states in the working thread. 31 | * 32 | * [[org.apache.flink.api.common.functions.IterationRuntimeContext#getIterationAggregator(String)]] 33 | * is not supported since the aggregator may be modified by multiple threads. 34 | * 35 | * @tparam IN The type of the input value. 36 | * @tparam OUT The type of the output value. 
37 | */ 38 | abstract class RichAsyncFunction[IN, OUT] 39 | extends AbstractRichFunction 40 | with AsyncFunction [IN, OUT] {} 41 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/demo/SlotPartitionMaker.scala: -------------------------------------------------------------------------------- 1 | package com.venn.demo 2 | 3 | import com.google.gson.GsonBuilder 4 | 5 | import java.text.SimpleDateFormat 6 | import java.util.{Calendar, Date} 7 | import com.venn.common.Common 8 | import com.venn.util.MathUtil 9 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 10 | 11 | 12 | /** 13 | * test data maker 14 | */ 15 | 16 | object SlotPartitionMaker { 17 | 18 | var minute: Int = 1 19 | val calendar: Calendar = Calendar.getInstance() 20 | /** 21 | * 一天时间比较长,不方便观察,将时间改为当前时间, 22 | * 每次累加10分钟,这样一天只需要144次循环,也就是144秒 23 | * 24 | * @return 25 | */ 26 | def getCreateTime(): String = { 27 | // minute = minute + 1 28 | calendar.add(Calendar.MILLISECOND, 10) 29 | sdf.format(calendar.getTime) 30 | } 31 | 32 | val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS") 33 | 34 | def main(args: Array[String]): Unit = { 35 | 36 | val prop = Common.getProp 37 | prop.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer") 38 | prop.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer") 39 | 40 | val producer = new KafkaProducer[String, String](Common.getProp) 41 | calendar.setTime(new Date()) 42 | println(sdf.format(calendar.getTime)) 43 | var i = 0; 44 | while (true) { 45 | val map = Map("id" -> i, "createTime" -> getCreateTime(), "amt" -> (MathUtil.random.nextInt(10) + "." + MathUtil.random.nextInt(10))) 46 | val gson = new GsonBuilder().create(); 47 | gson.toJson(map); 48 | println(gson.toString()) 49 | // topic current_day 50 | val msg = new ProducerRecord[String, String]("slot_partition", gson.toString()) 51 | producer.send(msg) 52 | producer.flush() 53 | if (MathUtil.random.nextBoolean()) { 54 | Thread.sleep(1500) 55 | } else { 56 | Thread.sleep(500) 57 | 58 | } 59 | i = i + 1 60 | // System.exit(-1) 61 | } 62 | } 63 | 64 | } 65 | 66 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/question/stock/entry/OverStockDetail.java: -------------------------------------------------------------------------------- 1 | package com.venn.question.stock.entry; 2 | 3 | import java.math.BigDecimal; 4 | 5 | /** 6 | * @Classname OverStockDetail 7 | * @Description TODO 8 | * @Date 2023/6/8 9 | * @Created by venn 10 | */ 11 | public class OverStockDetail implements Stock{ 12 | 13 | private int id; 14 | private String fid; 15 | private String fentryId; 16 | private String fmaterialId; 17 | private BigDecimal frealQty; 18 | 19 | public OverStockDetail() { 20 | } 21 | 22 | public OverStockDetail(int id, String fid, String fentryid, String fmaterialid, BigDecimal frealqty) { 23 | this.id = id; 24 | this.fid = fid; 25 | this.fentryId = fentryid; 26 | this.fmaterialId = fmaterialid; 27 | this.frealQty = frealqty; 28 | } 29 | 30 | public int getId() { 31 | return id; 32 | } 33 | 34 | public void setId(int id) { 35 | this.id = id; 36 | } 37 | 38 | public String getFid() { 39 | return fid; 40 | } 41 | 42 | public void setFid(String fid) { 43 | this.fid = fid; 44 | } 45 | 46 | public String getFentryId() { 47 | return fentryId; 48 | } 49 | 50 | public void setFentryId(String fentryId) { 51 | this.fentryId = fentryId; 52 | } 53 | 54 | public String 
getFmaterialId() { 55 | return fmaterialId; 56 | } 57 | 58 | public void setFmaterialId(String fmaterialId) { 59 | this.fmaterialId = fmaterialId; 60 | } 61 | 62 | public BigDecimal getFrealQty() { 63 | return frealQty; 64 | } 65 | 66 | public void setFrealQty(BigDecimal frealQty) { 67 | this.frealQty = frealQty; 68 | } 69 | 70 | @Override 71 | public String toString() { 72 | return "OverStockDetail{" + 73 | "id=" + id + 74 | ", fid='" + fid + '\'' + 75 | ", fentryId='" + fentryId + '\'' + 76 | ", fmaterialId='" + fmaterialId + '\'' + 77 | ", frealQty=" + frealQty + 78 | '}'; 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/question/cdcStarrocks/CdcStarMapFunction.java: -------------------------------------------------------------------------------- 1 | package com.venn.question.cdcStarrocks; 2 | 3 | import com.google.gson.JsonElement; 4 | import com.google.gson.JsonObject; 5 | import com.google.gson.JsonParser; 6 | import org.apache.flink.api.common.functions.RichMapFunction; 7 | import org.apache.flink.configuration.Configuration; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | 11 | import java.util.Map; 12 | 13 | public class CdcStarMapFunction extends RichMapFunction { 14 | 15 | private final static Logger LOG = LoggerFactory.getLogger(CdcStarMapFunction.class); 16 | private JsonParser parser; 17 | 18 | @Override 19 | public void open(Configuration parameters) throws Exception { 20 | parser = new JsonParser(); 21 | } 22 | 23 | @Override 24 | public CdcRecord map(String element) throws Exception { 25 | 26 | LOG.info("data : {}", element); 27 | JsonObject object = parser.parse(element).getAsJsonObject(); 28 | String db = object.get("db").getAsString(); 29 | String table = object.get("table").getAsString(); 30 | String op = object.get("operator_type").getAsString(); 31 | 32 | CdcRecord record = new CdcRecord(db, table, op); 33 | 34 | // insert/update 35 | String dataLocation = "after"; 36 | if ("d".equals(op)) { 37 | // if op is delete, get before 38 | dataLocation = "before"; 39 | } 40 | 41 | JsonObject data = object.get(dataLocation).getAsJsonObject(); 42 | 43 | for (Map.Entry entry : data.entrySet()) { 44 | 45 | String columnName = entry.getKey(); 46 | String columnValue; 47 | JsonElement value = entry.getValue(); 48 | if (!value.isJsonNull()) { 49 | // if column value is not null, get as string 50 | columnValue = value.getAsString(); 51 | // put column name/value to record.data 52 | record.getData().put(columnName, columnValue); 53 | } 54 | 55 | } 56 | 57 | return record; 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/source/mysql/cdc/CommonKafkaSink.java: -------------------------------------------------------------------------------- 1 | package com.venn.source.mysql.cdc; 2 | 3 | import com.google.gson.JsonObject; 4 | import com.google.gson.JsonParser; 5 | import org.apache.flink.configuration.Configuration; 6 | import org.apache.flink.streaming.api.functions.sink.RichSinkFunction; 7 | import org.apache.kafka.clients.producer.KafkaProducer; 8 | import org.apache.kafka.clients.producer.ProducerRecord; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | import java.util.Properties; 12 | 13 | public class CommonKafkaSink extends RichSinkFunction { 14 | 15 | protected static final Logger LOG = LoggerFactory.getLogger(CommonKafkaSink.class); 16 | private transient KafkaProducer kafkaProducer; 17 
| private transient JsonParser parser; 18 | private final String bootstrapServer; 19 | 20 | public CommonKafkaSink(String bootstrapServer) { 21 | this.bootstrapServer = bootstrapServer; 22 | } 23 | 24 | 25 | @Override 26 | public void open(Configuration parameters) { 27 | Properties prop = new Properties(); 28 | prop.put("bootstrap.servers", bootstrapServer); 29 | prop.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer"); 30 | prop.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer"); 31 | prop.put("request.timeout.ms", "10"); 32 | kafkaProducer = new KafkaProducer<>(prop); 33 | parser = new JsonParser(); 34 | 35 | } 36 | 37 | @Override 38 | public void invoke(String element, Context context) { 39 | 40 | JsonObject jsonObject = parser.parse(element).getAsJsonObject(); 41 | String db = jsonObject.get("db").getAsString(); 42 | String table = jsonObject.get("table").getAsString(); 43 | // topic 不存在就自动创建 44 | String topic = db + "_" + table; 45 | topic = db; 46 | ProducerRecord record = new ProducerRecord<>(topic, element); 47 | kafkaProducer.send(record); 48 | } 49 | 50 | @Override 51 | public void close() { 52 | kafkaProducer.close(); 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/com/venn/util/SimpleKafkaRecordDeserializationSchema.java: -------------------------------------------------------------------------------- 1 | package com.venn.util; 2 | 3 | import com.venn.entity.KafkaSimpleStringRecord; 4 | import org.apache.flink.api.common.serialization.DeserializationSchema; 5 | import org.apache.flink.api.common.typeinfo.TypeInformation; 6 | import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema; 7 | import org.apache.flink.util.Collector; 8 | import org.apache.kafka.clients.consumer.ConsumerRecord; 9 | import org.apache.kafka.common.TopicPartition; 10 | import org.apache.kafka.common.serialization.Deserializer; 11 | import org.apache.kafka.common.serialization.StringDeserializer; 12 | 13 | import java.io.IOException; 14 | 15 | public class SimpleKafkaRecordDeserializationSchema 16 | implements KafkaRecordDeserializationSchema { 17 | private static final long serialVersionUID = -3765473065594331694L; 18 | private transient Deserializer deserializer; 19 | 20 | @Override 21 | public void open(DeserializationSchema.InitializationContext context) throws Exception { 22 | 23 | } 24 | 25 | @Override 26 | public void deserialize( 27 | ConsumerRecord record, Collector collector) 28 | throws IOException { 29 | if (deserializer == null) { 30 | deserializer = new StringDeserializer(); 31 | } 32 | long offset = record.offset(); 33 | String key = null; 34 | if (record.key() != null) { 35 | key = new String(record.key()); 36 | } 37 | long timestamp = record.timestamp(); 38 | 39 | 40 | // makeup MyStringKafkaRecord 41 | KafkaSimpleStringRecord myRecord = new KafkaSimpleStringRecord( 42 | new TopicPartition(record.topic(), record.partition()), offset, key, timestamp, deserializer.deserialize(record.topic(), record.value())); 43 | 44 | collector.collect(myRecord); 45 | } 46 | 47 | @Override 48 | public TypeInformation getProducedType() { 49 | return TypeInformation.of(KafkaSimpleStringRecord.class); 50 | } 51 | } 52 | 53 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/question/stock/entry/OverStock.java: 
-------------------------------------------------------------------------------- 1 | package com.venn.question.stock.entry; 2 | 3 | /** 4 | * @Classname OverStock 5 | * @Description TODO 6 | * @Date 2023/6/8 7 | * @Created by venn 8 | */ 9 | public class OverStock implements Stock{ 10 | 11 | private int id; 12 | private long fdate; 13 | private String fid; 14 | private String fbillNo; 15 | private String fcustomerId; 16 | private String fGjzh; 17 | 18 | public OverStock() { 19 | } 20 | 21 | public OverStock(int id, long fdate, String fid, String fbillno, String fcustomerid, String fGjzh) { 22 | this.id = id; 23 | this.fdate = fdate; 24 | this.fid = fid; 25 | this.fbillNo = fbillno; 26 | this.fcustomerId = fcustomerid; 27 | this.fGjzh = fGjzh; 28 | } 29 | 30 | public int getId() { 31 | return id; 32 | } 33 | 34 | public void setId(int id) { 35 | this.id = id; 36 | } 37 | 38 | public long getFdate() { 39 | return fdate; 40 | } 41 | 42 | public void setFdate(long fdate) { 43 | this.fdate = fdate; 44 | } 45 | 46 | public String getFid() { 47 | return fid; 48 | } 49 | 50 | public void setFid(String fid) { 51 | this.fid = fid; 52 | } 53 | 54 | public String getFbillNo() { 55 | return fbillNo; 56 | } 57 | 58 | public void setFbillNo(String fbillNo) { 59 | this.fbillNo = fbillNo; 60 | } 61 | 62 | public String getFcustomerId() { 63 | return fcustomerId; 64 | } 65 | 66 | public void setFcustomerId(String fcustomerId) { 67 | this.fcustomerId = fcustomerId; 68 | } 69 | 70 | public String getfGjzh() { 71 | return fGjzh; 72 | } 73 | 74 | public void setfGjzh(String fGjzh) { 75 | this.fGjzh = fGjzh; 76 | } 77 | 78 | @Override 79 | public String toString() { 80 | return "OverStock{" + 81 | "id=" + id + 82 | ", fdate=" + fdate + 83 | ", fid='" + fid + '\'' + 84 | ", fbillNo='" + fbillNo + '\'' + 85 | ", fcustomerId='" + fcustomerId + '\'' + 86 | ", fGjzh='" + fGjzh + '\'' + 87 | '}'; 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/main/java/com/venn/entity/UserLog.java: -------------------------------------------------------------------------------- 1 | package com.venn.entity; 2 | 3 | import com.google.gson.annotations.SerializedName; 4 | 5 | public class UserLog { 6 | 7 | @SerializedName("user_id") 8 | private String userId; 9 | @SerializedName("item_id") 10 | private String itemId; 11 | @SerializedName("category_id") 12 | private String categoryId; 13 | private String behavior; 14 | private String ts; 15 | private Long timestamp; 16 | 17 | public UserLog() { 18 | } 19 | 20 | public UserLog(String userId, String itemId, String categoryId, String behavior, String ts) { 21 | this.userId = userId; 22 | this.itemId = itemId; 23 | this.categoryId = categoryId; 24 | this.behavior = behavior; 25 | this.ts = ts; 26 | } 27 | 28 | public Long getTimestamp() { 29 | return timestamp; 30 | } 31 | 32 | public void setTimestamp(Long timestamp) { 33 | this.timestamp = timestamp; 34 | } 35 | 36 | public String getUserId() { 37 | return userId; 38 | } 39 | 40 | public void setUserId(String userId) { 41 | this.userId = userId; 42 | } 43 | 44 | public String getItemId() { 45 | return itemId; 46 | } 47 | 48 | public void setItemId(String itemId) { 49 | this.itemId = itemId; 50 | } 51 | 52 | public String getCategoryId() { 53 | return categoryId; 54 | } 55 | 56 | public void setCategoryId(String categoryId) { 57 | this.categoryId = categoryId; 58 | } 59 | 60 | public String getBehavior() { 61 | return behavior; 62 | } 63 | 64 | public void setBehavior(String behavior) 
{ 65 | this.behavior = behavior; 66 | } 67 | 68 | public String getTs() { 69 | return ts; 70 | } 71 | 72 | public void setTs(String ts) { 73 | this.ts = ts; 74 | } 75 | 76 | @Override 77 | public String toString() { 78 | return "UserLog{" + 79 | "userId='" + userId + '\'' + 80 | ", itemId='" + itemId + '\'' + 81 | ", categoryId='" + categoryId + '\'' + 82 | ", behavior='" + behavior + '\'' + 83 | ", ts='" + ts + '\'' + 84 | '}'; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/connector/pulsar/PulsarDemo.scala: -------------------------------------------------------------------------------- 1 | //package com.venn.connector.pulsar 2 | // 3 | //import java.time.Duration 4 | // 5 | //import org.apache.flink.api.common.serialization.SimpleStringSchema 6 | //import org.apache.flink.connector.pulsar.source.PulsarSource 7 | //import org.apache.flink.connector.pulsar.source.enumerator.cursor.StartCursor 8 | //import org.apache.flink.connector.pulsar.source.reader.deserializer.PulsarDeserializationSchema 9 | //import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 10 | //import org.apache.pulsar.client.api.SubscriptionType 11 | //import org.apache.flink.api.scala._ 12 | //import com.venn.common.Common.PULSAR_SERVER 13 | //import com.venn.common.Common.PULSAR_ADMIN 14 | //import org.apache.flink.api.common.eventtime.WatermarkStrategy 15 | //import org.apache.flink.streaming.api.functions.ProcessFunction 16 | //import org.apache.flink.util.Collector 17 | // 18 | //object PulsarDemo { 19 | // 20 | // def main(args: Array[String]): Unit = { 21 | // val env = StreamExecutionEnvironment.getExecutionEnvironment 22 | // env.setParallelism(1) 23 | // 24 | // val pulsarSource = PulsarSource.builder() 25 | // .setServiceUrl(PULSAR_SERVER) 26 | // .setAdminUrl(PULSAR_ADMIN) 27 | // .setStartCursor(StartCursor.earliest()) 28 | // .setTopics("user_log") 29 | // .setDeserializationSchema(PulsarDeserializationSchema.flinkSchema(new SimpleStringSchema())) 30 | // .setSubscriptionName("my-subscription") 31 | // .setSubscriptionType(SubscriptionType.Exclusive) 32 | // .build() 33 | // 34 | // //env.fromSource(pulsarSource, WatermarkStrategy.forBoundedOutOfOrderness(Duration.ofSeconds(5))) 35 | // env.fromSource(pulsarSource, WatermarkStrategy.noWatermarks(), "pulsar") 36 | // .map(str => str) 37 | // .process(new ProcessFunction[String, String] { 38 | // var count: Long = 0 39 | // 40 | // override def processElement(element: String, ctx: ProcessFunction[String, String]#Context, out: Collector[String]): Unit = { 41 | // count += 1 42 | // if (count % 1000 == 0) { 43 | // println("count: ", count) 44 | // } 45 | // } 46 | // }) 47 | // 48 | // 49 | // env.execute("pulsar demo") 50 | // 51 | // 52 | // } 53 | // 54 | //} 55 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/util/CheckpointUtil.scala: -------------------------------------------------------------------------------- 1 | package com.venn.util 2 | 3 | import org.apache.flink.runtime.state.StateBackend 4 | import org.apache.flink.runtime.state.hashmap.HashMapStateBackend 5 | import org.apache.flink.streaming.api.CheckpointingMode 6 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 7 | import org.apache.flink.contrib.streaming.state.EmbeddedRocksDBStateBackend 8 | 9 | object CheckpointUtil { 10 | 11 | def setCheckpoint(env: StreamExecutionEnvironment, stateBackendStr: String, 
checkpointPath: String, interval: Long, timeOut: Long) = { 12 | var stateBackend: StateBackend = null 13 | if ("rocksdb".equals(stateBackendStr)) { 14 | stateBackend = new EmbeddedRocksDBStateBackend(true) 15 | } else { 16 | stateBackend = new HashMapStateBackend() 17 | } 18 | env.setStateBackend(stateBackend) 19 | // checkpoint 20 | env.enableCheckpointing(interval * 1000, CheckpointingMode.EXACTLY_ONCE) 21 | env.getCheckpointConfig.setCheckpointTimeout(timeOut * 1000) 22 | // Flink 1.11.0 new feature: Enables unaligned checkpoints 23 | env.getCheckpointConfig.enableUnalignedCheckpoints() 24 | // checkpoint dir 25 | env.getCheckpointConfig.setCheckpointStorage(checkpointPath) 26 | 27 | } 28 | 29 | /** 30 | * 31 | * @param env 32 | * @param stateBackendStr state backend: rocksdb, other 33 | * @param checkpointPath checkpoint path 34 | * @param interval second 35 | */ 36 | def setCheckpoint(env: StreamExecutionEnvironment, stateBackendStr: String, checkpointPath: String, interval: Long) = { 37 | var stateBackend: StateBackend = null 38 | if ("rocksdb".equals(stateBackendStr)) { 39 | stateBackend = new EmbeddedRocksDBStateBackend(true) 40 | } else { 41 | stateBackend = new HashMapStateBackend() 42 | } 43 | env.setStateBackend(stateBackend) 44 | // checkpoint 45 | env.enableCheckpointing(interval * 1000, CheckpointingMode.EXACTLY_ONCE) 46 | // env.getCheckpointConfig.setCheckpointTimeout(timeOut * 1000) 47 | // Flink 1.11.0 new feature: Enables unaligned checkpoints 48 | env.getCheckpointConfig.enableUnalignedCheckpoints() 49 | // checkpoint dir 50 | env.getCheckpointConfig.setCheckpointStorage(checkpointPath) 51 | 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/question/dynamicWindow/DyTumblingWindow.java: -------------------------------------------------------------------------------- 1 | package com.venn.question.dynamicWindow; 2 | 3 | import org.apache.flink.api.common.ExecutionConfig; 4 | import org.apache.flink.api.common.typeutils.TypeSerializer; 5 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 6 | import org.apache.flink.streaming.api.windowing.assigners.WindowAssigner; 7 | import org.apache.flink.streaming.api.windowing.triggers.EventTimeTrigger; 8 | import org.apache.flink.streaming.api.windowing.triggers.Trigger; 9 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow; 10 | 11 | import java.util.Collection; 12 | import java.util.Collections; 13 | 14 | /** 15 | * flink dynamic window 16 | */ 17 | public class DyTumblingWindow extends WindowAssigner { 18 | 19 | private final long size; 20 | 21 | private final long offset; 22 | 23 | protected DyTumblingWindow(long size, long offset) { 24 | if (Math.abs(offset) >= size) { 25 | throw new IllegalArgumentException("TumblingEventTimeWindows parameters must satisfy abs(offset) < size"); 26 | } 27 | 28 | this.size = size; 29 | this.offset = offset; 30 | } 31 | 32 | @Override 33 | public Collection assignWindows(Object element, long timestamp, WindowAssignerContext context) { 34 | 35 | if (timestamp > Long.MIN_VALUE) { 36 | long start = TimeWindow.getWindowStartWithOffset(timestamp, offset, size); 37 | return Collections.singletonList(new TimeWindow(start, start + size)); 38 | } else { 39 | throw new RuntimeException("Record has Long.MIN_VALUE timestamp (= no timestamp marker). 
" + 40 | "Is the time characteristic set to 'ProcessingTime', or did you forget to call " + 41 | "'DataStream.assignTimestampsAndWatermarks(...)'?"); 42 | } 43 | } 44 | 45 | @Override 46 | public Trigger getDefaultTrigger(StreamExecutionEnvironment env) { 47 | return EventTimeTrigger.create(); 48 | } 49 | 50 | @Override 51 | public TypeSerializer getWindowSerializer(ExecutionConfig executionConfig) { 52 | return null; 53 | } 54 | 55 | @Override 56 | public boolean isEventTime() { 57 | return false; 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/com/venn/demo/KafkaJoinRedisDemo.java: -------------------------------------------------------------------------------- 1 | package com.venn.demo; 2 | 3 | import com.venn.entity.KafkaSimpleStringRecord; 4 | import com.venn.util.SimpleKafkaRecordDeserializationSchema; 5 | import org.apache.flink.api.common.eventtime.WatermarkStrategy; 6 | import org.apache.flink.api.common.functions.MapFunction; 7 | import org.apache.flink.connector.kafka.source.KafkaSource; 8 | import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer; 9 | import org.apache.flink.streaming.api.datastream.AsyncDataStream; 10 | import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; 11 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 12 | 13 | import java.util.concurrent.TimeUnit; 14 | 15 | public class KafkaJoinRedisDemo { 16 | 17 | private static final String uri = "redis://localhost"; 18 | private static final String bootstrapServer = "localhost:9092"; 19 | private static final String topic = "user_log"; 20 | 21 | public static void main(String[] args) throws Exception { 22 | 23 | StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 24 | env.setParallelism(1); 25 | 26 | // kafka source 27 | KafkaSource kafkaSource = KafkaSource 28 | .builder() 29 | .setBootstrapServers(bootstrapServer) 30 | .setDeserializer(new SimpleKafkaRecordDeserializationSchema()) 31 | .setStartingOffsets(OffsetsInitializer.latest()) 32 | .setTopics(topic) 33 | .build(); 34 | 35 | 36 | // get value 37 | SingleOutputStreamOperator source = env 38 | .fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "kafkaSource") 39 | .map((MapFunction) value -> value.getValue()); 40 | 41 | // async redis 42 | AsyncRedisFunction asyncRedisFunction = new AsyncRedisFunction(uri); 43 | SingleOutputStreamOperator asyncStream = AsyncDataStream 44 | .unorderedWait(source, asyncRedisFunction, 5L, TimeUnit.SECONDS); 45 | 46 | // print result 47 | asyncStream 48 | .print("match redis"); 49 | 50 | env.execute("kafkaJoinRedis"); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/source/mysql/cdc/Binlog.java: -------------------------------------------------------------------------------- 1 | package com.venn.source.mysql.cdc; 2 | 3 | public class Binlog { 4 | private String host; 5 | private int port; 6 | private String db; 7 | private String table; 8 | private String file; 9 | private Long pos; 10 | private Long tsSec; 11 | private String operatorType; 12 | private String data; 13 | private String source; 14 | 15 | public Binlog() { 16 | } 17 | 18 | public Binlog(String host, int port) { 19 | this.host = host; 20 | this.port = port; 21 | } 22 | 23 | public Long getTsSec() { 24 | return tsSec; 25 | } 26 | 27 | public void setTsSec(Long tsSec) { 28 | this.tsSec = tsSec; 29 | } 
30 | 31 | public String getHost() { 32 | return host; 33 | } 34 | 35 | public void setHost(String host) { 36 | this.host = host; 37 | } 38 | 39 | public int getPort() { 40 | return port; 41 | } 42 | 43 | public void setPort(int port) { 44 | this.port = port; 45 | } 46 | 47 | public String getSource() { 48 | return source; 49 | } 50 | 51 | public void setSource(String source) { 52 | this.source = source; 53 | } 54 | 55 | public String getDb() { 56 | return db; 57 | } 58 | 59 | public void setDb(String db) { 60 | this.db = db; 61 | } 62 | 63 | public String getTable() { 64 | return table; 65 | } 66 | 67 | public void setTable(String table) { 68 | this.table = table; 69 | } 70 | 71 | public String getFile() { 72 | return file; 73 | } 74 | 75 | public void setFile(String file) { 76 | this.file = file; 77 | } 78 | 79 | public Long getPos() { 80 | return pos; 81 | } 82 | 83 | public void setPos(Long pos) { 84 | this.pos = pos; 85 | } 86 | 87 | public String getOperatorType() { 88 | return operatorType; 89 | } 90 | 91 | public void setOperatorType(String operatorType) { 92 | this.operatorType = operatorType; 93 | } 94 | 95 | public String getData() { 96 | return data; 97 | } 98 | 99 | public void setData(String data) { 100 | this.data = data; 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/question/UserClue/UserClue.scala: -------------------------------------------------------------------------------- 1 | //package com.venn.question.UserClue 2 | // 3 | //import com.venn.entity.KafkaSimpleStringRecord 4 | //import com.venn.util.SimpleKafkaRecordDeserializationSchema 5 | //import org.apache.flink.api.common.eventtime.WatermarkStrategy 6 | //import org.apache.flink.api.common.functions.RichMapFunction 7 | //import org.apache.flink.connector.kafka.source.KafkaSource 8 | //import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer 9 | //import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema 10 | //import org.apache.flink.formats.json.JsonNodeDeserializationSchema 11 | //import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.ObjectNode 12 | //import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 13 | //import org.apache.flink.streaming.connectors.kafka.internals.KafkaDeserializationSchemaWrapper 14 | //import org.apache.flink.streaming.util.serialization.JSONKeyValueDeserializationSchema 15 | //import org.apache.flink.api.scala._ 16 | // 17 | //import java.util 18 | // 19 | //object UserClue { 20 | // val bootstrapServer = "localhost:9092" 21 | // val topic = "user_log" 22 | // val sinkTopic = "user_log_sink" 23 | // 24 | // def main(args: Array[String]): Unit = { 25 | // 26 | // val env = StreamExecutionEnvironment.getExecutionEnvironment 27 | // env.setParallelism(1) 28 | // 29 | // val source = KafkaSource 30 | // .builder()[ObjectNode] 31 | // .setBootstrapServers(bootstrapServer) 32 | // .setGroupId("MyGroup") 33 | // .setClientIdPrefix("aa") 34 | // .setTopics(util.Arrays.asList("user_log")) 35 | // .setDeserializer(KafkaRecordDeserializationSchema.of(new JSONKeyValueDeserializationSchema(true))) 36 | //// .setDeserializer(new KafkaDeserializationSchemaWrapper()) 37 | // // .setStartingOffsets(OffsetsInitializer.earliest()) 38 | // .setStartingOffsets(OffsetsInitializer.latest()) 39 | // .build() 40 | // 41 | // env.fromSource(source, WatermarkStrategy.noWatermarks(), "source") 42 | // .map(new 
RichMapFunction[ObjectNode, ObjectNode] { 43 | // override def map(node: ObjectNode): ObjectNode = { 44 | // 45 | // val userId = node.get("user_id") 46 | // 47 | // node 48 | // } 49 | // }) 50 | // 51 | // 52 | // 53 | // } 54 | // 55 | //} 56 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/connector/jdbcOutput/MysqlSink.scala: -------------------------------------------------------------------------------- 1 | package com.venn.connector.jdbcOutput 2 | 3 | import java.sql.{Connection, DriverManager, PreparedStatement, SQLException} 4 | import org.apache.flink.configuration.Configuration 5 | import org.apache.flink.streaming.api.functions.sink.{RichSinkFunction, SinkFunction} 6 | import org.slf4j.{Logger, LoggerFactory} 7 | 8 | class MysqlSink extends RichSinkFunction[User] { 9 | 10 | val logger: Logger = LoggerFactory.getLogger("MysqlSink") 11 | var conn: Connection = _ 12 | var ps: PreparedStatement = _ 13 | val jdbcUrl = "jdbc:mysql://192.168.229.128:3306?useSSL=false&allowPublicKeyRetrieval=true" 14 | val username = "root" 15 | val password = "123456" 16 | val driverName = "com.mysql.jdbc.Driver" 17 | 18 | override def open(parameters: Configuration): Unit = { 19 | 20 | Class.forName(driverName) 21 | try { 22 | Class.forName(driverName) 23 | conn = DriverManager.getConnection(jdbcUrl, username, password) 24 | 25 | // close auto commit 26 | conn.setAutoCommit(false) 27 | } catch { 28 | case e@(_: ClassNotFoundException | _: SQLException) => 29 | logger.error("init mysql error") 30 | e.printStackTrace() 31 | System.exit(-1); 32 | } 33 | } 34 | 35 | override def invoke(user: User, context: SinkFunction.Context): Unit = { 36 | println("get user : " + user.toString) 37 | ps = conn.prepareStatement("insert into async.user(username, password, sex, phone) values(?,?,?,?)") 38 | ps.setString(1, user.username) 39 | ps.setString(2, user.password) 40 | ps.setInt(3, user.sex) 41 | ps.setString(4, user.phone) 42 | 43 | ps.execute() 44 | conn.commit() 45 | } 46 | 47 | /** 48 | * 吞吐量不够话,可以将数据暂存在状态中,批量提交的方式提高吞吐量(如果oom,可能就是数据量太大,资源没有及时释放导致的) 49 | * 50 | */ 51 | // override def invoke(user: User, context: SinkFunction.Context[_]): Unit = { 52 | // println("get user : " + user.toString) 53 | // ps = conn.prepareStatement("insert into async.user(username, password, sex, phone) values(?,?,?,?)") 54 | // ps.setString(1, user.username) 55 | // ps.setString(2, user.password) 56 | // ps.setInt(3, user.sex) 57 | // ps.setString(4, user.phone) 58 | // 59 | // ps.execute() 60 | // conn.commit() 61 | // } 62 | 63 | 64 | override def close(): Unit = { 65 | if (conn != null) { 66 | conn.commit() 67 | conn.close() 68 | } 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/question/stock/entry/StockList.java: -------------------------------------------------------------------------------- 1 | package com.venn.question.stock.entry; 2 | 3 | /** 4 | * @Classname StockList 5 | * @Description TODO 6 | * @Date 2023/6/8 7 | * @Created by venn 8 | */ 9 | public class StockList { 10 | 11 | private int id; 12 | private long createTime; 13 | private String outStockCode; 14 | private String createOp; 15 | private String distributorCode; 16 | private String inoutType; 17 | 18 | public StockList() { 19 | } 20 | 21 | public StockList(int id, long fdate, String fid, String fbillno, String fcustomerid, String fGjzh) { 22 | this.id = id; 23 | this.createTime = fdate; 24 | this.outStockCode = fid; 25 | 
this.createOp = fbillno; 26 | this.distributorCode = fcustomerid; 27 | this.inoutType = fGjzh; 28 | } 29 | 30 | public int getId() { 31 | return id; 32 | } 33 | 34 | public void setId(int id) { 35 | this.id = id; 36 | } 37 | 38 | public long getCreateTime() { 39 | return createTime; 40 | } 41 | 42 | public void setCreateTime(long createTime) { 43 | this.createTime = createTime; 44 | } 45 | 46 | public String getOutStockCode() { 47 | return outStockCode; 48 | } 49 | 50 | public void setOutStockCode(String outStockCode) { 51 | this.outStockCode = outStockCode; 52 | } 53 | 54 | public String getCreateOp() { 55 | return createOp; 56 | } 57 | 58 | public void setCreateOp(String createOp) { 59 | this.createOp = createOp; 60 | } 61 | 62 | public String getDistributorCode() { 63 | return distributorCode; 64 | } 65 | 66 | public void setDistributorCode(String distributorCode) { 67 | this.distributorCode = distributorCode; 68 | } 69 | 70 | public String getInoutType() { 71 | return inoutType; 72 | } 73 | 74 | public void setInoutType(String inoutType) { 75 | this.inoutType = inoutType; 76 | } 77 | 78 | @Override 79 | public String toString() { 80 | return "StockList{" + 81 | "id=" + id + 82 | ", createTime=" + createTime + 83 | ", outStockCode='" + outStockCode + '\'' + 84 | ", createOp='" + createOp + '\'' + 85 | ", distributorCode='" + distributorCode + '\'' + 86 | ", inoutType='" + inoutType + '\'' + 87 | '}'; 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/util/HttpClientUtil.java: -------------------------------------------------------------------------------- 1 | package com.venn.util; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.io.InputStreamReader; 7 | import java.net.HttpURLConnection; 8 | import java.net.MalformedURLException; 9 | import java.net.URL; 10 | 11 | public class HttpClientUtil { 12 | 13 | public static String doGet(String httpurl) throws IOException { 14 | HttpURLConnection connection = null; 15 | InputStream is = null; 16 | BufferedReader br = null; 17 | String result = null;// 返回结果字符串 18 | try { 19 | // 创建远程url连接对象 20 | URL url = new URL(httpurl); 21 | // 通过远程url连接对象打开一个连接,强转成httpURLConnection类 22 | connection = (HttpURLConnection) url.openConnection(); 23 | // 设置连接方式:get 24 | connection.setRequestMethod("GET"); 25 | // 设置连接主机服务器的超时时间:15000毫秒 26 | connection.setConnectTimeout(15000); 27 | // 设置读取远程返回的数据时间:60000毫秒 28 | connection.setReadTimeout(60000); 29 | // 发送请求 30 | connection.connect(); 31 | // 通过connection连接,获取输入流 32 | if (connection.getResponseCode() == 200) { 33 | is = connection.getInputStream(); 34 | // 封装输入流is,并指定字符集 35 | br = new BufferedReader(new InputStreamReader(is, "UTF-8")); 36 | // 存放数据 37 | StringBuffer sbf = new StringBuffer(); 38 | String temp = null; 39 | while ((temp = br.readLine()) != null) { 40 | sbf.append(temp); 41 | sbf.append("\r\n"); 42 | } 43 | result = sbf.toString(); 44 | } 45 | } catch (MalformedURLException e) { 46 | e.printStackTrace(); 47 | } catch (IOException e) { 48 | e.printStackTrace(); 49 | } finally { 50 | // 关闭资源 51 | if (null != br) { 52 | try { 53 | br.close(); 54 | } catch (IOException e) { 55 | e.printStackTrace(); 56 | } 57 | } 58 | 59 | if (null != is) { 60 | try { 61 | is.close(); 62 | } catch (IOException e) { 63 | e.printStackTrace(); 64 | } 65 | } 66 | 67 | connection.disconnect();// 关闭远程连接 68 | } 69 | 70 | return result; 71 | } 72 | } 73 | 
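A quick usage sketch for the HTTP helper above (not in the repository); the endpoint URL is a placeholder, and doGet returns the response body only when the server answers with HTTP 200, otherwise null.

import com.venn.util.HttpClientUtil;

import java.io.IOException;

public class HttpClientUtilUsageSketch {

    public static void main(String[] args) throws IOException {
        // placeholder endpoint; doGet blocks until the body is read or the 15s/60s timeouts are hit
        String body = HttpClientUtil.doGet("http://localhost:8888/hello");
        System.out.println(body);
    }
}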
-------------------------------------------------------------------------------- /src/main/scala/com/venn/question/stock/entry/StockListDetail.java: -------------------------------------------------------------------------------- 1 | package com.venn.question.stock.entry; 2 | 3 | import java.math.BigDecimal; 4 | 5 | /** 6 | * @Classname StockListDetail 7 | * @Description TODO 8 | * @Date 2023/6/8 9 | * @Created by venn 10 | */ 11 | public class StockListDetail { 12 | 13 | private int id; 14 | private long createTime; 15 | private String outStockCode; 16 | private String productCode; 17 | private String createOp; 18 | private BigDecimal outStockNum; 19 | 20 | public StockListDetail() { 21 | } 22 | 23 | public StockListDetail(int id, long createTime, String outStockCode, String productCode, String createOp, BigDecimal outStockNum) { 24 | this.id = id; 25 | this.createTime = createTime; 26 | this.outStockCode = outStockCode; 27 | this.productCode = productCode; 28 | this.createOp = createOp; 29 | this.outStockNum = outStockNum; 30 | } 31 | 32 | public int getId() { 33 | return id; 34 | } 35 | 36 | public void setId(int id) { 37 | this.id = id; 38 | } 39 | 40 | public long getCreateTime() { 41 | return createTime; 42 | } 43 | 44 | public void setCreateTime(long createTime) { 45 | this.createTime = createTime; 46 | } 47 | 48 | public String getOutStockCode() { 49 | return outStockCode; 50 | } 51 | 52 | public void setOutStockCode(String outStockCode) { 53 | this.outStockCode = outStockCode; 54 | } 55 | 56 | public String getProductCode() { 57 | return productCode; 58 | } 59 | 60 | public void setProductCode(String productCode) { 61 | this.productCode = productCode; 62 | } 63 | 64 | public String getCreateOp() { 65 | return createOp; 66 | } 67 | 68 | public void setCreateOp(String createOp) { 69 | this.createOp = createOp; 70 | } 71 | 72 | public BigDecimal getOutStockNum() { 73 | return outStockNum; 74 | } 75 | 76 | public void setOutStockNum(BigDecimal outStockNum) { 77 | this.outStockNum = outStockNum; 78 | } 79 | 80 | @Override 81 | public String toString() { 82 | return "StockListDetail{" + 83 | "id=" + id + 84 | ", createTime=" + createTime + 85 | ", outStockCode='" + outStockCode + '\'' + 86 | ", productCode='" + productCode + '\'' + 87 | ", createOp='" + createOp + '\'' + 88 | ", outStockNum=" + outStockNum + 89 | '}'; 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/stream/api/trigger/ProcessWindowForTrigger.scala: -------------------------------------------------------------------------------- 1 | package com.venn.stream.api.trigger 2 | 3 | import java.io.File 4 | import java.text.SimpleDateFormat 5 | import com.venn.common.Common 6 | import com.venn.util.CheckpointUtil 7 | import org.apache.flink.api.common.serialization.SimpleStringSchema 8 | import org.apache.flink.api.scala._ 9 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 10 | import org.apache.flink.streaming.api.scala.function.ProcessAllWindowFunction 11 | import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows 12 | import org.apache.flink.streaming.api.windowing.time.Time 13 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow 14 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer 15 | import org.apache.flink.util.Collector 16 | import org.slf4j.LoggerFactory 17 | 18 | /** 19 | * for test CountAndContinuousProcessTimeTrigger 20 | * 21 | */ 22 | object 
ProcessWindowDemoForTrigger { 23 | val logger = LoggerFactory.getLogger(this.getClass) 24 | 25 | def main(args: Array[String]): Unit = { 26 | // environment 27 | val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment 28 | env.setParallelism(1) 29 | if ("\\".equals(File.pathSeparator)) { 30 | // val rock = new RocksDBStateBackend(Common.CHECK_POINT_DATA_DIR) 31 | // env.setStateBackend(rock) 32 | // checkpoint interval 33 | // env.enableCheckpointing(10000) 34 | CheckpointUtil.setCheckpoint(env, "rocksdb", Common.CHECK_POINT_DATA_DIR, 10) 35 | } 36 | 37 | val topic = "current_day" 38 | val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS") 39 | 40 | val kafkaSource = new FlinkKafkaConsumer[String](topic, new SimpleStringSchema(), Common.getProp) 41 | val stream = env.addSource(kafkaSource) 42 | .map(s => { 43 | s 44 | }) 45 | .windowAll(TumblingProcessingTimeWindows.of(Time.seconds(60))) 46 | .trigger(CountAndTimeTrigger.of(10, Time.seconds(10))) 47 | .process(new ProcessAllWindowFunction[String, String, TimeWindow] { 48 | 49 | override def process(context: Context, elements: Iterable[String], out: Collector[String]): Unit = { 50 | 51 | var count = 0 52 | 53 | elements.iterator.foreach(s => { 54 | count += 1 55 | }) 56 | logger.info("this trigger have : {} item", count) 57 | } 58 | 59 | }) 60 | 61 | // execute job 62 | env.execute(this.getClass.getName) 63 | } 64 | 65 | } 66 | 67 | 68 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/question/dynamicWindow/DyProcessWindowFunction.scala: -------------------------------------------------------------------------------- 1 | package com.venn.question.dynamicWindow 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util 5 | 6 | import com.google.gson.Gson 7 | import org.apache.flink.configuration.Configuration 8 | import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction 9 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow 10 | import org.apache.flink.util.Collector 11 | import org.slf4j.LoggerFactory 12 | 13 | class DyProcessWindowFunction() extends ProcessWindowFunction[(DataEntity, Command), String, String, TimeWindow] { 14 | 15 | val logger = LoggerFactory.getLogger("DyProcessWindowFunction") 16 | var gson: Gson = _ 17 | 18 | 19 | override def open(parameters: Configuration): Unit = { 20 | gson = new Gson() 21 | } 22 | 23 | override def process(key: String, context: Context, elements: Iterable[(DataEntity, Command)], out: Collector[String]): Unit = { 24 | // start-end 25 | val taskId = elements.head._2.taskId 26 | val method = elements.head._2.method 27 | val targetAttr = elements.head._2.targetAttr 28 | val periodStartTime = context.window.getStart 29 | val periodEndTime = context.window.getEnd 30 | 31 | var value: Double = 0d 32 | method match { 33 | case "sum" => 34 | value = 0d 35 | case "min" => 36 | value = Double.MaxValue 37 | case "max" => 38 | value = Double.MinValue 39 | case _ => 40 | logger.warn("input method exception") 41 | return 42 | } 43 | 44 | val it = elements.toIterator 45 | while (it.hasNext) { 46 | val currentValue = it.next()._1.value 47 | method match { 48 | case "sum" => 49 | value += currentValue 50 | case "count" => 51 | value += 1 52 | case "min" => 53 | if (currentValue < value) { 54 | value = currentValue 55 | } 56 | case "max" => 57 | if (currentValue > value) { 58 | value = currentValue 59 | } 60 | case _ => 61 | } 62 | } 63 | 64 | val sdf = new SimpleDateFormat("HH:mm:ss") 65 
| val resultMap = new util.HashMap[String, String] 66 | resultMap.put("taskId", taskId) 67 | resultMap.put("method", method) 68 | resultMap.put("targetAttr", targetAttr) 69 | resultMap.put("periodStartTime", sdf.format(periodStartTime)) 70 | resultMap.put("periodEndTime", sdf.format(periodEndTime)) 71 | resultMap.put("value", value.toString) 72 | 73 | out.collect(gson.toJson(resultMap)) 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/source/mysql/cdc/MySqlBinlogSourceExample.java: -------------------------------------------------------------------------------- 1 | package com.venn.source.mysql.cdc; 2 | 3 | import com.ververica.cdc.connectors.mysql.source.MySqlSource; 4 | import com.ververica.cdc.connectors.mysql.table.StartupOptions; 5 | import org.apache.flink.api.common.eventtime.WatermarkStrategy; 6 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 7 | 8 | import java.util.Properties; 9 | 10 | /** 11 | * mysql cdc demo 12 | */ 13 | public class MySqlBinlogSourceExample { 14 | public static void main(String[] args) throws Exception { 15 | 16 | String ip = "10.201.0.166"; 17 | int port = 3306; 18 | String dbReg = "deepexi.*"; 19 | String tableReg = "[deepexi|dolphinscheduler].*"; 20 | String user = "root"; 21 | String pass = "daas2020"; 22 | 23 | // caev 24 | ip = "10.1.8.43"; 25 | dbReg = "order_pro"; 26 | tableReg = "order_opay_info"; 27 | pass = "enc(1C0F4C32D822B87CB4D8AC91246BFD64)"; 28 | 29 | 30 | 31 | 32 | String bootstrapServer = "dcmp12:9092"; 33 | 34 | if (args.length > 6) { 35 | ip = args[0]; 36 | port = Integer.parseInt(args[1]); 37 | dbReg = args[2]; 38 | // tableReg = args[3]; 39 | user = args[4]; 40 | pass = args[5]; 41 | } 42 | 43 | 44 | Properties prop = new Properties(); 45 | MySqlSource sourceFunction = MySqlSource.builder() 46 | .hostname(ip) 47 | .port(port) 48 | // 获取两个数据库的所有表 49 | .databaseList(dbReg) 50 | .tableList(tableReg) 51 | .username(user) 52 | .password(pass) 53 | .startupOptions(StartupOptions.latest()) 54 | // 自定义 解析器,讲数据解析成 json 55 | .deserializer(new CommonStringDebeziumDeserializationSchema(ip, port)) 56 | .debeziumProperties(prop) 57 | .build(); 58 | 59 | StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 60 | env.setParallelism(1); 61 | env 62 | .fromSource(sourceFunction, WatermarkStrategy.noWatermarks(), "cdc") 63 | .map(str -> str) 64 | .filter(str -> str.contains("DD012209160741922731")) 65 | .print(); 66 | // 将数据发送到不同的 topic 67 | // .addSink(new CommonKafkaSink(bootstrapServer)) 68 | // .setParallelism(1); 69 | 70 | env.execute(); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/java/com/venn/flink/asyncio/AsyncFunctionForMysqlJava.java: -------------------------------------------------------------------------------- 1 | package com.venn.flink.asyncio; 2 | 3 | import org.apache.flink.configuration.Configuration; 4 | import org.apache.flink.streaming.api.functions.async.ResultFuture; 5 | import org.apache.flink.streaming.api.functions.async.RichAsyncFunction; 6 | import org.slf4j.Logger; 7 | import org.slf4j.LoggerFactory; 8 | 9 | import java.util.ArrayList; 10 | import java.util.Collections; 11 | import java.util.List; 12 | import java.util.concurrent.ExecutorService; 13 | import java.util.concurrent.Executors; 14 | 15 | public class AsyncFunctionForMysqlJava extends RichAsyncFunction { 16 | 17 | 18 | Logger logger = 
LoggerFactory.getLogger(AsyncFunctionForMysqlJava.class); 19 | private transient MysqlClient client; 20 | private transient ExecutorService executorService; 21 | 22 | /** 23 | * open 方法中初始化链接 24 | * 25 | * @param parameters 26 | * @throws Exception 27 | */ 28 | @Override 29 | public void open(Configuration parameters) throws Exception { 30 | logger.info("async function for mysql java open ..."); 31 | super.open(parameters); 32 | 33 | client = new MysqlClient(); 34 | executorService = Executors.newFixedThreadPool(30); 35 | } 36 | 37 | /** 38 | * use asyncUser.getId async get asyncUser phone 39 | * 40 | * @param asyncUser 41 | * @param resultFuture 42 | * @throws Exception 43 | */ 44 | @Override 45 | public void asyncInvoke(AsyncUser asyncUser, ResultFuture resultFuture) throws Exception { 46 | 47 | executorService.submit(() -> { 48 | // submit query 49 | System.out.println("submit query : " + asyncUser.getId() + "-1-" + System.currentTimeMillis()); 50 | AsyncUser tmp = client.query1(asyncUser); 51 | // 一定要记得放回 resultFuture,不然数据全部是timeout 的 52 | resultFuture.complete(Collections.singletonList(tmp)); 53 | }); 54 | } 55 | 56 | @Override 57 | public void timeout(AsyncUser input, ResultFuture resultFuture) throws Exception { 58 | logger.warn("Async function for hbase timeout"); 59 | List list = new ArrayList(); 60 | input.setPhone("timeout"); 61 | list.add(input); 62 | resultFuture.complete(list); 63 | } 64 | 65 | /** 66 | * close function 67 | * 68 | * @throws Exception 69 | */ 70 | @Override 71 | public void close() throws Exception { 72 | logger.info("async function for mysql java close ..."); 73 | super.close(); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/question/late1mtps/LateTpsProcessWindowFunction.scala: -------------------------------------------------------------------------------- 1 | package com.venn.question.late1mtps 2 | 3 | import com.venn.util.DateTimeUtil 4 | import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor} 5 | import org.apache.flink.configuration.Configuration 6 | import org.apache.flink.streaming.api.scala.function.ProcessAllWindowFunction 7 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow 8 | import org.apache.flink.util.Collector 9 | 10 | import java.util 11 | 12 | /** 13 | * 整分钟输出间隔的窗口 14 | * @param windowSize 15 | * @param intervalSize 16 | */ 17 | class FixedLateTpsProcessAllWindowFunction(windowSize: Int, intervalSize: Int) extends ProcessAllWindowFunction[(String, Long), (String, String, Int, Double), TimeWindow] { 18 | 19 | // for last window, last senond 20 | var lastWindow: ValueState[Double] = _ 21 | var interval: Int = _ 22 | 23 | override def open(parameters: Configuration): Unit = { 24 | 25 | // windowState = getRuntimeContext.getMapState(new MapStateDescriptor[Int, Long]("window", classOf[Int], classOf[Long])) 26 | lastWindow = getRuntimeContext.getState(new ValueStateDescriptor[Double]("last", classOf[Double])) 27 | 28 | interval = windowSize / intervalSize 29 | } 30 | 31 | override def process(context: Context, elements: Iterable[(String, Long)], out: Collector[(String, String, Int, Double)]): Unit = { 32 | 33 | // get window 34 | val windowStart = DateTimeUtil.formatMillis(context.window.getStart, DateTimeUtil.YYYY_MM_DD_HH_MM_SS) 35 | val windowEnd = DateTimeUtil.formatMillis(context.window.getEnd, DateTimeUtil.YYYY_MM_DD_HH_MM_SS) 36 | var lastWindowCount = lastWindow.value() 37 | if (lastWindowCount == null) { 38 | 
lastWindowCount = 0 39 | } 40 | 41 | // init tps map 42 | val map = new util.HashMap[Int, Long]() 43 | for (i <- 0 until interval) { 44 | map.put(i, 0) 45 | } 46 | 47 | // bucket each element by its second offset within the window 48 | elements.foreach((e: (String, Long)) => { 49 | val current: Int = (e._2 / 1000 % interval).toInt 50 | map.put(current, map.get(current) + 1) 51 | }) 52 | 53 | // the zero slot of this window reports the previous window's last slot 54 | out.collect(windowStart, windowEnd, 0, lastWindowCount) 55 | for (i <- 0 until interval - 1) { 56 | out.collect(windowStart, windowEnd, i + 1, map.get(i + 1) / 60.0) 57 | } 58 | 59 | // keep this window's last slot as the next window's zero-slot value 60 | lastWindow.update(map.get(interval - 1) / 60.0) 61 | 62 | } 63 | 64 | override def close(): Unit = { 65 | lastWindow.clear() 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /src/main/java/com/venn/flink/asyncio/AsyncMysqlRequest.java: -------------------------------------------------------------------------------- 1 | package com.venn.flink.asyncio; 2 | 3 | import com.google.gson.Gson; 4 | import com.venn.common.Common; 5 | import org.apache.flink.formats.json.JsonNodeDeserializationSchema; 6 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.ObjectNode; 7 | import org.apache.flink.streaming.api.TimeCharacteristic; 8 | import org.apache.flink.streaming.api.datastream.AsyncDataStream; 9 | import org.apache.flink.streaming.api.datastream.DataStream; 10 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 11 | import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor; 12 | import org.apache.flink.streaming.api.windowing.time.Time; 13 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer; 14 | 15 | import java.util.concurrent.TimeUnit; 16 | 17 | 18 | public class AsyncMysqlRequest { 19 | 20 | 21 | public static void main(String[] args) throws Exception { 22 | 23 | final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 24 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); 25 | FlinkKafkaConsumer<ObjectNode> source = new FlinkKafkaConsumer<>("async", new JsonNodeDeserializationSchema(), Common.getProp()); 26 | source.setStartFromLatest(); 27 | 28 | // consume the kafka records and convert them to AsyncUser objects 29 | DataStream<AsyncUser> input = env.addSource(source) 30 | .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<ObjectNode>(Time.seconds(60)) { 31 | @Override 32 | public long extractTimestamp(ObjectNode element) { 33 | return element.get("id").asLong(0) + 1000; 34 | } 35 | }) 36 | .map(value -> { 37 | String id = value.get("id").asText(); 38 | String username = value.get("username").asText(); 39 | String password = value.get("password").asText(); 40 | 41 | return new AsyncUser(id, username, password); 42 | }); 43 | // async IO lookup against mysql, timeout 1s, capacity 10 (once 10 requests are in flight, the upstream operator is backpressured) 44 | DataStream<AsyncUser> async = AsyncDataStream 45 | .unorderedWait(input, 46 | new AsyncFunctionForMysqlJava(), 47 | 1000, 48 | TimeUnit.MILLISECONDS, 49 | 10); 50 | 51 | async.map(user -> { 52 | return new Gson().toJson(user).toString(); 53 | }) 54 | .print(); 55 | 56 | env.execute("asyncForMysql"); 57 | 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/connector/filesink/StreamingFileSinkDemo.scala: -------------------------------------------------------------------------------- 1 | //package
com.venn.connector.filesink 2 | // 3 | //import java.io.File 4 | //import java.text.SimpleDateFormat 5 | // 6 | //import com.venn.common.Common 7 | //import org.apache.flink.api.common.serialization.{BulkWriter, SimpleStringEncoder} 8 | //import org.apache.flink.api.scala._ 9 | //import org.apache.flink.core.fs.Path 10 | //import org.apache.flink.formats.json.JsonNodeDeserializationSchema 11 | //import org.apache.flink.runtime.state.filesystem.FsStateBackend 12 | //import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.ObjectNode 13 | //import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink 14 | //import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 15 | //import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic} 16 | //import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer 17 | // 18 | //object StreamingFileSinkDemo { 19 | // 20 | // def main(args: Array[String]): Unit = { 21 | // 22 | // val env = StreamExecutionEnvironment.getExecutionEnvironment 23 | // env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) 24 | // if ("/".equals(File.separator)) { 25 | // val backend = new FsStateBackend(Common.CHECK_POINT_DATA_DIR, true) 26 | // env.setStateBackend(backend) 27 | // env.enableCheckpointing(10 * 1000, CheckpointingMode.EXACTLY_ONCE) 28 | // } else { 29 | // env.setMaxParallelism(1) 30 | // env.setParallelism(1) 31 | // } 32 | // 33 | // val sdf = new SimpleDateFormat("yyyyMMddHHmmss") 34 | // val source = new FlinkKafkaConsumer[ObjectNode]("roll_file_sink", new JsonNodeDeserializationSchema, Common.getProp) 35 | // // row format 36 | // val sinkRow = StreamingFileSink 37 | // .forRowFormat(new Path("D:\\idea_out\\rollfilesink"), new SimpleStringEncoder[ObjectNode]("UTF-8")) 38 | // .withBucketAssigner(new DayBucketAssigner) 39 | // .withBucketCheckInterval(60 * 60 * 1000l) // 1 hour 40 | // .build() 41 | // 42 | // // use define BulkWriterFactory and DayBucketAssinger 43 | // val sinkBuck = StreamingFileSink 44 | // .forBulkFormat(new Path("D:\\idea_out\\rollfilesink"), new DayBulkWriterFactory) 45 | // .withBucketAssigner(new DayBucketAssigner()) 46 | // .withBucketCheckInterval(60 * 60 * 1000l) // 1 hour 47 | // .build() 48 | // 49 | // 50 | // env.addSource(source) 51 | // .assignAscendingTimestamps(json => { 52 | // sdf.parse(json.get("date").asText()).getTime 53 | // }) 54 | // .map(json => { 55 | //// json.get("date") + "-" + json.toString 56 | // json 57 | // }) 58 | // .addSink(sinkBuck) 59 | // 60 | // env.execute("StreamingFileSink") 61 | // } 62 | // 63 | //} 64 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/question/dynamicWindow/readme.md: -------------------------------------------------------------------------------- 1 | # Dynamic Window Staticstics 2 | 3 | 4 | 用 Flink 实现一个动态窗口统计的功能,使用 flink 1.10.0。实现的功能包括: 5 | 6 | 7 | ## 1. 
定义命令流Source,格式 8 | { 9 | 'taskId': '任务id', 10 | 'targetAttr': '要统计的属性', 11 | 'method': '统计方法,有 SUM 求和,MAX 最大值, MIN 最小值三种' 12 | 'periodUnit': '统计周期任务,有 SECOND 和 MINUTE 两个值', 13 | 'periodLength': '周期的长度,数值', 14 | 'startTime': '任务开始的UNIX时间戳,单位毫秒' 15 | } 16 | 17 | 18 | 如: 19 | { 20 | 'taskId': 'task1', 21 | 'targetAttr': 'attr1', 22 | 'method': 'SUM' 23 | 'periodUnit': 'MINUTE', 24 | 'periodLength': '3', 25 | 'startTime': '1598596980000' 26 | } 27 | 表示 从 2020/8/28 14:43:00 开始统计属性attr1每三分钟的和 28 | 29 | 30 | 题目要求命令流发送4条数据,固定为下: 31 | { 32 | 'taskId': 'task1', 33 | 'targetAttr': 'attr1', 34 | 'method': 'SUM' 35 | 'periodUnit': 'SECOND', 36 | 'periodLength': '30', 37 | 'startTime': '1598596980000' 38 | } 39 | 40 | 41 | { 42 | 'taskId': 'task2', 43 | 'targetAttr': 'attr1', 44 | 'method': 'SUM' 45 | 'periodUnit': 'MINUTE', 46 | 'periodLength': '1', 47 | 'startTime': '1598596980000' 48 | } 49 | 50 | 51 | { 52 | 'taskId': 'task3', 53 | 'targetAttr': 'attr2', 54 | 'method': 'MAX' 55 | 'periodUnit': 'SECOND', 56 | 'periodLength': '30', 57 | 'startTime': '1598596980000' 58 | } 59 | 60 | 61 | { 62 | 'taskId': 'task4', 63 | 'targetAttr': 'attr3', 64 | 'method': 'MAX' 65 | 'periodUnit': 'MINUTE', 66 | 'periodLength': '2', 67 | 'startTime': '1598596980000' 68 | } 69 | 70 | ```text 71 | {"taskId":"task1","targetAttr":"attr2","method":"sum","periodUnit":"SECOND","periodLength":"20","startTime":"1598596980000"} 72 | {"taskId":"task2","targetAttr":"attr1","method":"sum","periodUnit":"MINUTE","periodLength":"1","startTime":"1598596980000"} 73 | {"taskId":"task3","targetAttr":"attr2","method":"max","periodUnit":"SECOND","periodLength":"30","startTime":"1598596980000"} 74 | {"taskId":"task4","targetAttr":"attr3","method":"min","periodUnit":"MINUTE","periodLength":"1","startTime":"1599640669628"} 75 | 76 | ``` 77 | 78 | 79 | ## 2. 定义数据流Source,格式如下: 80 | { 81 | "attr": '属性名', 82 | "value": double数值, 83 | "time": 'UNIX时间戳,单位毫秒' 84 | } 85 | 86 | 如: 87 | { 88 | 'attr': 'attr1', 89 | 'value': 35.0, 90 | 'time': '1598596980000' 91 | } 92 | 93 | 94 | 数据流需要每一秒发送4条数据,属性分别是 attr1、attr2、attr3 和 attr4,time使用当前unix毫秒时间戳,value使用 0~100的随机整数 95 | 96 | 97 | ## 3. 需要将命令流进行广播,然后和数据流进行connect,根据命令流指定的命令进行统计 98 | 99 | 统计参考涉及到的部分类或方法: DataStream.assignTimestampsAndWatermarks、keyBy、WindowAssigner、reduce、ProcessWindowFunction、addSink 100 | 101 | 102 | ## 4. 
实现一个输出到终端的 sink,将统计结果打印出来,每一条记录包括 taskId, targetAttr, periodStartTime(周期开始时间), value (统计后的值,double类型) -------------------------------------------------------------------------------- /src/main/scala/com/venn/question/UserClue/question.md: -------------------------------------------------------------------------------- 1 | ## 跟进记录 2 | ```csv 3 | 1、用户跟进记录表,分析用户跟进次数,取出每次跟进的 激活时间(状态2,3 中小的一条)和跟进失败时间(状态9) 4 | 2、每个用户跟进状态一定从 1 开始,直到 9 结束一次跟进 5 | 3、跟进状态可以重复和累加,不能递减 6 | 4、状态变化可以跳跃 7 | 5、激活时间为一次跟进过程中状态为 2 或 3 中时间小的一条记录 8 | 9 | 表结构、数据如下: 10 | 11 | create table clue_log( 12 | user_id bigint, 13 | create_time datetime, 14 | status int 15 | ) 16 | ENGINE=OLAP 17 | DUPLICATE KEY(user_id) 18 | DISTRIBUTED BY HASH(user_id) BUCKETS 8; 19 | 20 | insert into clue_log values 21 | (1,'2022-08-15 16:20:00', 1), 22 | (1,'2022-08-15 16:20:30', 2), 23 | (1,'2022-08-15 16:21:00', 3), 24 | (1,'2022-08-15 16:22:00', 4), 25 | (1,'2022-08-15 16:23:00', 5), 26 | (1,'2022-08-15 16:24:00', 9), 27 | 28 | (1,'2022-08-15 16:25:10', 1), 29 | (1,'2022-08-15 16:25:11', 1), 30 | (1,'2022-08-15 16:25:12', 1), 31 | (1,'2022-08-15 16:25:20', 3), 32 | (1,'2022-08-15 16:25:30', 4), 33 | (1,'2022-08-15 16:25:40', 5), 34 | (1,'2022-08-15 16:25:50', 9), 35 | 36 | (1,'2022-08-15 16:26:10', 1), 37 | (1,'2022-08-15 16:26:11', 2), 38 | (1,'2022-08-15 16:26:12', 2), 39 | (1,'2022-08-15 16:26:13', 2), 40 | (1,'2022-08-15 16:26:14', 3), 41 | (1,'2022-08-15 16:26:15', 4), 42 | (1,'2022-08-15 16:26:16', 5), 43 | (1,'2022-08-15 16:26:19', 9), 44 | 45 | (1,'2022-08-15 16:27:10', 1), 46 | (1,'2022-08-15 16:27:12', 3), 47 | (1,'2022-08-15 16:27:13', 3), 48 | (1,'2022-08-15 16:27:14', 3), 49 | (1,'2022-08-15 16:27:15', 4), 50 | (1,'2022-08-15 16:27:16', 5), 51 | 52 | (1,'2022-08-15 16:27:19', 9); 53 | 54 | 55 | insert into clue_log values 56 | (2,'2022-08-15 16:20:00', 1), 57 | (2,'2022-08-15 16:20:30', 2), 58 | (2,'2022-08-15 16:21:00', 3), 59 | (2,'2022-08-15 16:22:00', 4), 60 | (2,'2022-08-15 16:23:00', 5), 61 | (2,'2022-08-15 16:24:00', 9), 62 | (2,'2022-08-15 16:25:10', 1), 63 | (2,'2022-08-15 16:25:20', 3), 64 | (2,'2022-08-15 16:25:30', 4), 65 | (2,'2022-08-15 16:25:40', 5), 66 | (2,'2022-08-15 16:25:50', 9), 67 | (2,'2022-08-15 16:26:10', 1), 68 | (2,'2022-08-15 16:26:11', 2), 69 | (2,'2022-08-15 16:26:12', 2), 70 | (2,'2022-08-15 16:26:13', 2), 71 | (2,'2022-08-15 16:26:14', 3), 72 | (2,'2022-08-15 16:26:15', 4), 73 | (2,'2022-08-15 16:26:16', 5), 74 | (2,'2022-08-15 16:26:19', 9), 75 | (2,'2022-08-15 16:27:10', 1), 76 | (2,'2022-08-15 16:27:12', 3), 77 | (2,'2022-08-15 16:27:13', 3), 78 | (2,'2022-08-15 16:27:14', 3), 79 | (2,'2022-08-15 16:27:15', 4), 80 | (2,'2022-08-15 16:27:16', 5), 81 | (2,'2022-08-15 16:27:19', 9); 82 | 83 | select user_id, active_time,fail_time 84 | from ( 85 | select a.user_id, min(if(b.status in (2,3) , b.create_time, null)) active_time, min(if(b.status in (9) , b.create_time, null)) fail_time 86 | from clue_log a 87 | left join clue_log b on a.user_id = b.user_id and b.create_time >= a.create_time 88 | where a.status = 1 89 | group by a.user_id, a.create_time 90 | )a 91 | where fail_time is not null 92 | group by user_id, active_time,fail_time 93 | ; 94 | 95 | ``` -------------------------------------------------------------------------------- /src/main/scala/com/venn/source/kafka/KafkaUpsertTableSink.java: -------------------------------------------------------------------------------- 1 | ///* 2 | // * Licensed to the Apache Software Foundation (ASF) under one or more 3 | // * contributor license agreements. 
See the NOTICE file distributed with 4 | // * this work for additional information regarding copyright ownership. 5 | // * The ASF licenses this file to You under the Apache License, Version 2.0 6 | // * (the "License"); you may not use this file except in compliance with 7 | // * the License. You may obtain a copy of the License at 8 | // * 9 | // * http://www.apache.org/licenses/LICENSE-2.0 10 | // * 11 | // * Unless required by applicable law or agreed to in writing, software 12 | // * distributed under the License is distributed on an "AS IS" BASIS, 13 | // * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // * See the License for the specific language governing permissions and 15 | // * limitations under the License. 16 | // */ 17 | // 18 | //package com.venn.source.kafka; 19 | // 20 | //import org.apache.flink.annotation.Internal; 21 | //import org.apache.flink.api.common.serialization.SerializationSchema; 22 | //import org.apache.flink.api.common.typeinfo.TypeInformation; 23 | //import org.apache.flink.streaming.api.functions.sink.SinkFunction; 24 | //import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer; 25 | //import org.apache.flink.streaming.connectors.kafka.internals.KeyedSerializationSchemaWrapper; 26 | //import org.apache.flink.streaming.connectors.kafka.partitioner.FlinkKafkaPartitioner; 27 | //import org.apache.flink.table.api.TableSchema; 28 | //import org.apache.flink.types.Row; 29 | // 30 | //import java.util.Optional; 31 | //import java.util.Properties; 32 | // 33 | ///** 34 | // * Kafka table sink for writing data into Kafka. 35 | // */ 36 | //@Internal 37 | //public class KafkaUpsertTableSink extends KafkaUpsertTableSinkBase{ 38 | // 39 | // public KafkaUpsertTableSink( 40 | // TableSchema schema, 41 | // String topic, 42 | // Properties properties, 43 | // Optional> partitioner, 44 | // SerializationSchema serializationSchema) { 45 | // 46 | // super(schema, topic, properties, partitioner, serializationSchema); 47 | // } 48 | // 49 | // @Override 50 | // protected SinkFunction createKafkaProducer( 51 | // String topic, 52 | // Properties properties, 53 | // SerializationSchema serializationSchema, 54 | // Optional> partitioner) { 55 | // // 很难理解 ,为什么内部版本用标记过期的构造器,明明有不过期的 56 | // return new FlinkKafkaProducer<>( 57 | // topic, 58 | // new KeyedSerializationSchemaWrapper<>(serializationSchema), 59 | // properties, 60 | // partitioner); 61 | // } 62 | // 63 | // @Override 64 | // public void setKeyFields(String[] keys) { 65 | // 66 | // } 67 | // 68 | // @Override 69 | // public void setIsAppendOnly(Boolean isAppendOnly) { 70 | // // todo just follow HBaseUpsertTableSink 71 | // } 72 | // 73 | // @Override 74 | // public TypeInformation getRecordType() { 75 | // return TypeInformation.of(Row.class); 76 | // } 77 | //} 78 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/demo/relationCntA.scala: -------------------------------------------------------------------------------- 1 | package com.venn.demo 2 | 3 | import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks 4 | import org.apache.flink.streaming.api.scala.{StreamExecutionEnvironment, _} 5 | import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction 6 | import org.apache.flink.streaming.api.watermark.Watermark 7 | import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows 8 | import org.apache.flink.streaming.api.windowing.time.Time 9 | import 
org.apache.flink.streaming.api.windowing.triggers.ContinuousEventTimeTrigger 10 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow 11 | import org.apache.flink.util.Collector 12 | 13 | import scala.collection.mutable.ListBuffer 14 | 15 | 16 | /** 17 | * 球哥 18 | */ 19 | object relationCntA { 20 | def main(args: Array[String]): Unit = { 21 | 22 | val windowTime = TumblingEventTimeWindows.of(Time.days(1), Time.hours(-8)) 23 | val triggerInterval = 40 24 | var backendFilePath = "" 25 | val parallelism = 1 26 | val evictorTime = 40 27 | backendFilePath = "hdfs:/tmp/relation" //存储checkpoint数据,//fs状态后端配置,如为file:///,则在taskmanager的本地 28 | val env=StreamExecutionEnvironment.getExecutionEnvironment 29 | val endStream=env.addSource(new CustomerSource) 30 | 31 | //先做条件过滤 32 | 33 | val outStream = endStream 34 | .assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks[Tuple2[Long,Long]] { 35 | var currentMaxTimestamp = 0L 36 | val maxOutOfOrderness = 2000L //2秒 37 | var lastEmittedWatermark: Long = Long.MinValue 38 | 39 | override def extractTimestamp(t: Tuple2[Long,Long], l: Long): Long = { 40 | val timestamp = t._1 41 | println("---------2---timestamp--------" + timestamp) 42 | if (timestamp > currentMaxTimestamp) { 43 | currentMaxTimestamp = timestamp 44 | } 45 | timestamp 46 | } 47 | override def getCurrentWatermark: Watermark = { 48 | 49 | //允许延迟2秒 50 | val potentialWM = currentMaxTimestamp - maxOutOfOrderness 51 | if (potentialWM >= lastEmittedWatermark) { 52 | lastEmittedWatermark = potentialWM -1 53 | } 54 | new Watermark(lastEmittedWatermark) 55 | } 56 | 57 | }) 58 | .keyBy(data => data._2) 59 | .window(TumblingEventTimeWindows.of(Time.days(1), Time.hours(-8))) //统计今天内的数据量 60 | .trigger(ContinuousEventTimeTrigger.of(Time.seconds(10))) 61 | //.evictor(TimeEvictor.of(Time.seconds(evictorTime), true)) 62 | .process(new MyProcessWindowFunction) 63 | 64 | 65 | env.execute("kafka test") 66 | } 67 | } 68 | class MyProcessWindowFunction extends ProcessWindowFunction[(Long, Long), (String, Long), Long, TimeWindow] { 69 | 70 | // 一个窗口结束的时候调用一次(一个分组执行一次),不适合大量数据,全量数据保存在内存中,会造成内存溢出 71 | override def process(key: Long, context: Context, elements: Iterable[(Long, Long)], out: Collector[(String, Long)]): Unit = { 72 | // 聚合,注意:整个窗口的数据保存到Iterable,里面有很多行数据, Iterable的size就是日志的总行数 73 | println("dddddddddddddddddd") 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/main/java/com/venn/flink/asyncio/AsyncFunctionForHbaseJava.java: -------------------------------------------------------------------------------- 1 | package com.venn.flink.asyncio; 2 | 3 | 4 | import org.apache.flink.configuration.Configuration; 5 | import org.apache.flink.streaming.api.functions.async.ResultFuture; 6 | import org.apache.flink.streaming.api.functions.async.RichAsyncFunction; 7 | import org.apache.hadoop.hbase.HBaseConfiguration; 8 | import org.apache.hadoop.hbase.HConstants; 9 | import org.apache.hadoop.hbase.TableName; 10 | import org.apache.hadoop.hbase.client.*; 11 | import org.apache.hadoop.hbase.util.Bytes; 12 | import org.slf4j.Logger; 13 | import org.slf4j.LoggerFactory; 14 | 15 | import java.util.ArrayList; 16 | import java.util.List; 17 | 18 | public class AsyncFunctionForHbaseJava extends RichAsyncFunction { 19 | 20 | Table table = null; 21 | Logger logger = LoggerFactory.getLogger(AsyncFunctionForHbaseJava.class); 22 | @Override 23 | public void open(Configuration parameters) throws Exception { 24 | logger.info("async function for hbase java 
open ..."); 25 | super.open(parameters); 26 | org.apache.hadoop.conf.Configuration config = HBaseConfiguration.create(); 27 | 28 | config.set(HConstants.ZOOKEEPER_QUORUM, "venn"); 29 | config.set(HConstants.ZOOKEEPER_CLIENT_PORT, "2181"); 30 | config.setInt(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, 30000); 31 | config.setInt(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 30000); 32 | 33 | TableName tableName = TableName.valueOf("async"); 34 | Connection conn = ConnectionFactory.createConnection(config); 35 | table = conn.getTable(tableName); 36 | } 37 | 38 | 39 | /** 40 | * use asyncUser.getId get asyncUser phone 41 | * @param asyncUser 42 | * @param resultFuture 43 | * @throws Exception 44 | */ 45 | @Override 46 | public void asyncInvoke(AsyncUser asyncUser, ResultFuture resultFuture) throws Exception { 47 | 48 | Get get = new Get(asyncUser.getId().getBytes()); 49 | get.addColumn("cf".getBytes(), "phone".getBytes()); 50 | 51 | Result result = table.get(get); 52 | 53 | String phone = Bytes.toString(result.getValue("cf".getBytes(), "phone".getBytes())); 54 | 55 | if ( phone ==null || phone.length() != 11){ 56 | phone = "00000000000"; 57 | } 58 | asyncUser.setPhone(phone); 59 | List list = new ArrayList(); 60 | list.add(asyncUser); 61 | resultFuture.complete(list); 62 | } 63 | 64 | @Override 65 | public void timeout(AsyncUser input, ResultFuture resultFuture) throws Exception { 66 | logger.info("Async function for hbase timeout"); 67 | List list = new ArrayList(); 68 | input.setPhone("00000000001"); 69 | list.add(input); 70 | resultFuture.complete(list); 71 | 72 | } 73 | 74 | /** 75 | * close function 76 | * @throws Exception 77 | */ 78 | @Override 79 | public void close() throws Exception { 80 | logger.info("async function for hbase java close ..."); 81 | super.close(); 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/connector/filesink/RollingFileSinkDemo.scala: -------------------------------------------------------------------------------- 1 | //package com.venn.connector.filesink 2 | // 3 | //import java.io.File 4 | //import java.text.SimpleDateFormat 5 | // 6 | //import com.venn.common.Common 7 | //import org.apache.flink.formats.json.JsonNodeDeserializationSchema 8 | //import org.apache.flink.runtime.state.filesystem.FsStateBackend 9 | //import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.ObjectNode 10 | //import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic} 11 | //import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 12 | //import org.apache.flink.streaming.connectors.fs.StringWriter 13 | //import org.apache.flink.streaming.connectors.fs.bucketing.BucketingSink 14 | //import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer 15 | //import org.apache.flink.api.scala._ 16 | // 17 | ///** 18 | // * 使用BucketingSink 实现 根据‘数据’自定义输出目录 19 | // */ 20 | //object RollingFileSinkDemo { 21 | // 22 | // def main(args: Array[String]): Unit = { 23 | // 24 | // val env = StreamExecutionEnvironment.getExecutionEnvironment 25 | // env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) 26 | // if ("/".equals(File.separator)) { 27 | // val backend = new FsStateBackend(Common.CHECK_POINT_DATA_DIR, true) 28 | // env.setStateBackend(backend) 29 | // env.enableCheckpointing(10 * 1000, CheckpointingMode.EXACTLY_ONCE) 30 | // } else { 31 | // env.setMaxParallelism(1) 32 | // env.setParallelism(1) 33 | // } 34 | // 35 | // val sdf = new 
SimpleDateFormat("yyyyMMddHHmmss") 36 | // val source = new FlinkKafkaConsumer[ObjectNode]("roll_file_sink", new JsonNodeDeserializationSchema, Common.getProp) 37 | // 38 | // /** 39 | // * 这里有个问题,因为重写了BasePathBucketer,自定义了输出文件, 40 | // * 所有会同时打开多个输出文件,带来文件刷新的问题,在当前文件写完后(这里的表现是:当天 41 | // * 的数据以及全部流过,下一天的文件以及开始写了),会发现 42 | // * 当天的文件中的数据不全,因为数据还没有全部刷到文件,这个时候下一个文件 43 | // * 又开始写了,会发现上一个文件还没刷完。 44 | // * 45 | // * 猜想:每个文件都有个输出缓冲,上一个文件最后一点数据还在缓冲区,下一个文件 46 | // * 又使用新的缓冲区,没办法刷到上一个文件的数据,只有等缓冲区数据满、超时一类的操作触发刷写 ?? 47 | // * 48 | // * 源码BucketingSink.closePartFilesByTime 49 | // * 默认每60秒或大于滚动时间间隔(batchRolloverInterval)(系统时间) 将当前park文件, 50 | // * 将状态从 in-process 修改为 pending,随后 51 | // * 关闭当前的part 文件,数据刷到磁盘 52 | // * 53 | // */ 54 | // val sink = new BucketingSink[String]("D:\\idea_out\\rollfilesink") 55 | // sink.setBucketer(new DayBasePathBucketer) 56 | // sink.setWriter(new StringWriter[String]) 57 | // sink.setBatchSize(1024 * 1024 * 400) // this is 400 MB, 58 | // // sink.setBatchRolloverInterval(24 * 60 * 60 * 1000) // this is 24 hour 59 | //// sink.setInProgressPrefix("inProcessPre") 60 | //// sink.setPendingPrefix("pendingpre") 61 | //// sink.setPartPrefix("partPre") 62 | // 63 | // env.addSource(source) 64 | // .assignAscendingTimestamps(json => { 65 | // sdf.parse(json.get("date").asText()).getTime 66 | // }) 67 | // .map(json => { 68 | // json.get("date") + "-" + json.toString 69 | // }) 70 | // .addSink(sink) 71 | // 72 | // env.execute("rollingFileSink") 73 | // } 74 | // 75 | //} 76 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/stream/api/checkpoint/CheckpointDebug.scala: -------------------------------------------------------------------------------- 1 | package com.venn.stream.api.checkpoint 2 | 3 | import com.venn.common.Common 4 | import com.venn.demo.CustomerSource 5 | import com.venn.source.TumblingEventTimeWindows 6 | import com.venn.util.CheckpointUtil 7 | import org.apache.flink.api.common.functions.RichFlatJoinFunction 8 | import org.apache.flink.api.common.serialization.SimpleStringSchema 9 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 10 | import org.apache.flink.api.scala._ 11 | import org.apache.flink.configuration.Configuration 12 | import org.apache.flink.streaming.api.windowing.time.Time 13 | import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaConsumer, FlinkKafkaProducer} 14 | import org.apache.flink.util.Collector 15 | import org.slf4j.LoggerFactory 16 | 17 | /** 18 | * for debug checkpoint 19 | */ 20 | object CheckpointDebug { 21 | val LOG = LoggerFactory.getLogger("CheckpointDebug") 22 | 23 | def main(args: Array[String]): Unit = { 24 | 25 | val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment 26 | env.setParallelism(1) 27 | CheckpointUtil.setCheckpoint(env, "rocksdb", Common.CHECK_POINT_DATA_DIR, 60) 28 | 29 | val prop = Common.getProp() 30 | val kafkaSource1 = new FlinkKafkaConsumer[String]("source_1", new SimpleStringSchema(), prop) 31 | val kafkaSource2 = new FlinkKafkaConsumer[String]("source_2", new SimpleStringSchema(), prop) 32 | val source1 = env.addSource(kafkaSource1) 33 | .name("source1") 34 | 35 | val source2 = env.addSource(kafkaSource2) 36 | .name("source2") 37 | val map1 = source1.map(item => { 38 | val arr = item.split(",") 39 | ("map_1", arr(0).toLong, arr(1).toLong) 40 | }) 41 | .name("map1") 42 | 43 | val map2 = source2.map(item => { 44 | val arr = item.split(",") 45 | ("map_1", arr(0).toLong, arr(1).toLong) 46 | 
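// note: this branch also tags records as "map_1"; given .name("map2") below, the tag was presumably meant to be "map_2" so the joined output can tell the two inputs apart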
}) 47 | .name("map2") 48 | 49 | val join = map1.join(map2) 50 | .where(_._2) 51 | .equalTo(_._2) 52 | .window(TumblingEventTimeWindows.of(Time.minutes(1))) 53 | .apply(new RichFlatJoinFunction[(String, Long, Long), (String, Long, Long), (String, String, Long, Long)] { 54 | 55 | override def open(parameters: Configuration): Unit = { 56 | LOG.info("RichFlatJoinFunction open") 57 | 58 | } 59 | 60 | // join 61 | override def join(first: (String, Long, Long), second: (String, Long, Long), out: Collector[(String, String, Long, Long)]): Unit = { 62 | 63 | 64 | out.collect((first._1, second._1, first._2, first._3)) 65 | 66 | } 67 | 68 | override def close(): Unit = { 69 | LOG.info("RichFlatJoinFunction close") 70 | 71 | } 72 | }) 73 | .name("join") 74 | 75 | 76 | val kafkaSink = new FlinkKafkaProducer[String]("localhost:9092", "checkpoint_debug", new SimpleStringSchema()) 77 | val sink = join.map(item => { 78 | item._1 + "," + item._2 + "," + item._3 + "," + item._4 79 | }) 80 | .name("joinFormat") 81 | .addSink(kafkaSink) 82 | .name("sink") 83 | 84 | 85 | env.execute("checkpointDebug") 86 | 87 | } 88 | 89 | } 90 | -------------------------------------------------------------------------------- /src/main/java/com/venn/flink/asyncio/MysqlClient.java: -------------------------------------------------------------------------------- 1 | package com.venn.flink.asyncio; 2 | 3 | 4 | import org.apache.flink.shaded.netty4.io.netty.channel.DefaultEventLoop; 5 | import org.apache.flink.shaded.netty4.io.netty.util.concurrent.Future; 6 | import org.apache.flink.shaded.netty4.io.netty.util.concurrent.SucceededFuture; 7 | 8 | import java.sql.DriverManager; 9 | import java.sql.PreparedStatement; 10 | import java.sql.ResultSet; 11 | import java.sql.SQLException; 12 | 13 | public class MysqlClient { 14 | 15 | private static String jdbcUrl = "jdbc:mysql://192.168.229.128:3306?useSSL=false&allowPublicKeyRetrieval=true"; 16 | private static String username = "root"; 17 | private static String password = "123456"; 18 | private static String driverName = "com.mysql.jdbc.Driver"; 19 | private static java.sql.Connection conn; 20 | private static PreparedStatement ps; 21 | 22 | static { 23 | try { 24 | Class.forName(driverName); 25 | conn = DriverManager.getConnection(jdbcUrl, username, password); 26 | ps = conn.prepareStatement("select phone from async.async_test where id = ?"); 27 | } catch (ClassNotFoundException | SQLException e) { 28 | e.printStackTrace(); 29 | } 30 | } 31 | 32 | /** 33 | * execute query 34 | * @param user 35 | * @return 36 | */ 37 | public AsyncUser query1(AsyncUser user) { 38 | 39 | try { 40 | Thread.sleep(10); 41 | } catch (InterruptedException e) { 42 | e.printStackTrace(); 43 | } 44 | 45 | String phone = "0000"; 46 | try { 47 | ps.setString(1, user.getId()); 48 | ResultSet rs = ps.executeQuery(); 49 | if (!rs.isClosed() && rs.next()) { 50 | phone = rs.getString(1); 51 | } 52 | System.out.println("execute query : " + user.getId() + "-2-" + "phone : " + phone +"-"+ System.currentTimeMillis()); 53 | } catch (SQLException e) { 54 | e.printStackTrace(); 55 | } 56 | user.setPhone(phone); 57 | return user; 58 | 59 | } 60 | 61 | public Future query2(AsyncUser user) { 62 | 63 | String phone = "0000"; 64 | try { 65 | ps.setString(1, user.getId()); 66 | ResultSet rs = ps.executeQuery(); 67 | System.out.println(user.getId() + "-3-" + System.currentTimeMillis()); 68 | if (rs.next()) { 69 | phone = rs.getString(1); 70 | } 71 | } catch ( 72 | SQLException e) { 73 | e.printStackTrace(); 74 | } 75 | 
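// the JDBC lookup above ran synchronously; the result is only wrapped in an already-completed SucceededFuture, so any real asynchrony has to come from the caller's thread pool (as AsyncFunctionForMysqlJava does with query1)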
user.setPhone(phone); 76 | return new SucceededFuture(new DefaultEventLoop(), user); 77 | 78 | } 79 | 80 | public static void main(String[] args) { 81 | MysqlClient mysqlClient = new MysqlClient(); 82 | 83 | AsyncUser asyncUser = new AsyncUser(); 84 | asyncUser.setId("526"); 85 | long start = System.currentTimeMillis(); 86 | asyncUser = mysqlClient.query1(asyncUser); 87 | 88 | System.out.println("end : " + (System.currentTimeMillis() - start)); 89 | System.out.println(asyncUser.toString()); 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/question/cdcStarrocks/CdcToStarRocks.java: -------------------------------------------------------------------------------- 1 | package com.venn.question.cdcStarrocks; 2 | 3 | import com.venn.source.mysql.cdc.CommonStringDebeziumDeserializationSchema; 4 | import com.ververica.cdc.connectors.mysql.source.MySqlSource; 5 | import com.ververica.cdc.connectors.mysql.table.StartupOptions; 6 | import org.apache.flink.api.common.eventtime.WatermarkStrategy; 7 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 8 | 9 | import java.util.Properties; 10 | 11 | /** 12 | * mysql cdc demo 13 | *

14 | * cdc 整库同步数据到 starrocks 15 | *

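 * pipeline: MySQL CDC source -> CdcStarMapFunction (json string -> CdcRecord) -> keyBy db_table -> CdcStarProcessFunction batches records (up to batchSize rows or batchInterval ms) -> StarRocks stream load sink (print() stands in while the sink line is commented out)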
16 | * 局限: 17 | * 1. 还未实现 starrocks 端表结构跟随 源端表结构同步变更 18 | * 2. 为了保证效率,仅会在每一个表第一次来的时候判断目标段是否存在该表,如果已经判定该表不存在,后续直接忽略该表的数据变更 19 | * 3. 部分不导入的表,只在sink 的时候做了过滤,前面的操作还是要继续,可以考虑在 反序列化和map中过滤掉目标库中不存在的表数据 20 | */ 21 | public class CdcToStarRocks { 22 | 23 | // 每个批次最大条数和等待时间 24 | private static int batchSize = 10000; 25 | private static long batchInterval = 10 * 1000; 26 | 27 | public static void main(String[] args) throws Exception { 28 | 29 | String ip = "localhost"; 30 | int port = 3306; 31 | String db = "hive_3"; 32 | // String table = "venn.user_log,venn.user_log_1"; 33 | String table = "hive_3.*"; 34 | String user = "root"; 35 | String pass = "123456"; 36 | 37 | String starrocksIp = "10.201.0.230"; 38 | String starrocksPort = "29030"; 39 | String starrocksLoadPort = "28030"; 40 | String starrocksUser = "root"; 41 | String starrocksPass = "123456"; 42 | String starrocksDb = "test"; 43 | 44 | StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 45 | env.setParallelism(1); 46 | 47 | MySqlSource sourceFunction = MySqlSource.builder() 48 | .hostname(ip) 49 | .port(port) 50 | // 获取两个数据库的所有表 51 | .databaseList(db) 52 | .tableList(table) 53 | .username(user) 54 | .password(pass) 55 | .startupOptions(StartupOptions.latest()) 56 | // .startupOptions(StartupOptions.initial()) 57 | // do not cache schema change 58 | // .includeSchemaChanges(true) 59 | // 自定义 解析器,讲数据解析成 json 60 | .deserializer(new CommonStringDebeziumDeserializationSchema(ip, port)) 61 | .build(); 62 | 63 | env 64 | .fromSource(sourceFunction, WatermarkStrategy.noWatermarks(), "cdc") 65 | .name("source") 66 | .uid("source") 67 | // json 字符串转 CdcRecord 68 | .map(new CdcStarMapFunction()) 69 | .name("map") 70 | .keyBy(record -> record.getDb() + "_" + record.getTable()) 71 | .process(new CdcStarProcessFunction(batchSize, batchInterval)) 72 | .name("process") 73 | .uid("process") 74 | .print(); 75 | // .addSink(new StarRocksSink(starrocksIp, starrocksPort, starrocksLoadPort, starrocksUser, starrocksPass, starrocksDb)) 76 | // .name("sink"); 77 | 78 | env.execute("cdcToStarRocks"); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/stream/api/tableJoin/CacheFile.scala: -------------------------------------------------------------------------------- 1 | package com.venn.stream.api.tableJoin 2 | 3 | import java.io.File 4 | import java.text.SimpleDateFormat 5 | 6 | import com.venn.common.Common 7 | import com.venn.util.CheckpointUtil 8 | import org.apache.flink.api.scala._ 9 | import org.apache.flink.api.common.functions.RichMapFunction 10 | import org.apache.flink.configuration.Configuration 11 | import org.apache.flink.formats.json.JsonNodeDeserializationSchema 12 | import org.apache.flink.runtime.state.filesystem.FsStateBackend 13 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.ObjectNode 14 | import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic} 15 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 16 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer 17 | 18 | import scala.io.Source 19 | 20 | /** 21 | * stream join read config from cache file 22 | * register at job start, never change again 23 | */ 24 | object CacheFile { 25 | 26 | def main(args: Array[String]): Unit = { 27 | 28 | val env = StreamExecutionEnvironment.getExecutionEnvironment 29 | // env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) 30 | if 
("/".equals(File.separator)) { 31 | // val backend = new FsStateBackend(Common.CHECK_POINT_DATA_DIR, true) 32 | // env.setStateBackend(backend) 33 | // env.enableCheckpointing(10 * 1000, CheckpointingMode.EXACTLY_ONCE) 34 | CheckpointUtil.setCheckpoint(env, "rocksdb", Common.CHECK_POINT_DATA_DIR, 10) 35 | env.registerCachedFile("/opt/flink1.7/data/tablejoin.txt", "tablejoin.txt") 36 | } else { 37 | env.setMaxParallelism(1) 38 | env.setParallelism(1) 39 | // file and register name 40 | env.registerCachedFile("C:\\Users\\venn\\git\\venn\\flinkDemo\\src\\main\\resources\\data\\tablejoin.txt", "tablejoin.txt") 41 | } 42 | // cache table 43 | 44 | 45 | val sdf = new SimpleDateFormat("yyyyMMddHHmmss") 46 | val source = new FlinkKafkaConsumer[ObjectNode]("table_join", new JsonNodeDeserializationSchema, Common.getProp) 47 | 48 | 49 | env.addSource(source) 50 | .map(json => { 51 | 52 | val id = json.get("id").asText() 53 | val phone = json.get("phone").asText() 54 | 55 | Tuple2(id, phone) 56 | }) 57 | .map(new RichMapFunction[(String, String), String] { 58 | 59 | var cache = Map("" -> "") 60 | 61 | override def open(parameters: Configuration): Unit = { 62 | 63 | // read cache file 64 | val file = getRuntimeContext.getDistributedCache.getFile("tablejoin.txt") 65 | if (file.canRead) { 66 | val context = Source.fromFile(file, "utf-8").getLines().toArray 67 | 68 | context.foreach(line => { 69 | val tmp = line.split(",") 70 | cache += (tmp(0) -> tmp(1)) 71 | }) 72 | } 73 | } 74 | 75 | override def map(value: (String, String)): String = { 76 | val name = cache.get(value._1) 77 | 78 | value._1 + "," + value._2 + "," + cache.get(value._1) 79 | } 80 | 81 | }) 82 | .print() 83 | 84 | env.execute("cacheFile") 85 | 86 | } 87 | 88 | } 89 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/connector/starrocks/CustJdbcSource.java: -------------------------------------------------------------------------------- 1 | package com.venn.connector.starrocks; 2 | 3 | import org.apache.flink.configuration.Configuration; 4 | import org.apache.flink.metrics.Counter; 5 | import org.apache.flink.metrics.SimpleCounter; 6 | import org.apache.flink.streaming.api.functions.source.RichSourceFunction; 7 | 8 | import java.sql.Connection; 9 | import java.sql.DriverManager; 10 | import java.sql.PreparedStatement; 11 | import java.sql.ResultSet; 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | import java.util.Random; 15 | 16 | public class CustJdbcSource extends RichSourceFunction { 17 | 18 | private String ip; 19 | private String port; 20 | private String user; 21 | private String pass; 22 | private String sql; 23 | private String colSep; 24 | private int batch; 25 | private int interval; 26 | private boolean flag = false; 27 | private transient Counter counter; 28 | private Random random = new Random(); 29 | 30 | private List cache = new ArrayList<>(); 31 | 32 | public CustJdbcSource(String ip, String port, String user, String pass, String sql, String colSep, int batch, int interval) { 33 | this.ip = ip; 34 | this.port = port; 35 | this.user = user; 36 | this.pass = pass; 37 | this.sql = sql; 38 | this.colSep = colSep; 39 | this.batch = batch; 40 | this.interval = interval * 1000; 41 | } 42 | 43 | 44 | @Override 45 | public void open(Configuration parameters) throws Exception { 46 | flag = true; 47 | 48 | counter = new SimpleCounter(); 49 | this.counter = getRuntimeContext() 50 | .getMetricGroup() 51 | .counter("myCounter"); 52 | // load data 53 | 54 | String 
url = "jdbc:mysql://" + ip + ":" + port; 55 | 56 | 57 | Connection connection = DriverManager.getConnection(url, this.user, this.pass); 58 | 59 | PreparedStatement ps = connection.prepareStatement(sql); 60 | 61 | ResultSet rs = ps.executeQuery(); 62 | 63 | int columnCount = rs.getMetaData().getColumnCount(); 64 | 65 | while (rs.next()) { 66 | 67 | StringBuilder builder = new StringBuilder(); 68 | for (int j = 1; j <= columnCount; j++) { 69 | if (j == columnCount) { 70 | builder.append(rs.getString(j)); 71 | } else { 72 | builder.append(rs.getString(j)).append(this.colSep); 73 | } 74 | } 75 | 76 | cache.add(builder.toString()); 77 | } 78 | 79 | System.out.println("load cache size : " + cache.size()); 80 | 81 | } 82 | 83 | @Override 84 | public void run(SourceContext ctx) throws Exception { 85 | 86 | int dataSize = cache.size(); 87 | while (flag) { 88 | int select = random.nextInt(dataSize); 89 | 90 | String data = cache.get(select); 91 | 92 | counter.inc(); 93 | ctx.collect(data); 94 | 95 | if (counter.getCount() % batch == 0) { 96 | Thread.sleep(interval); 97 | } 98 | } 99 | 100 | } 101 | 102 | @Override 103 | public void cancel() { 104 | flag = false; 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/question/dataFluctuation/DataFluctuation.scala: -------------------------------------------------------------------------------- 1 | package com.venn.question.dataFluctuation 2 | 3 | import com.google.gson.JsonParser 4 | import com.venn.entity.KafkaSimpleStringRecord 5 | import com.venn.util.{CheckpointUtil, DateTimeUtil, SimpleKafkaRecordDeserializationSchema} 6 | import org.apache.commons.lang.time.DateFormatUtils 7 | import org.apache.flink.api.common.eventtime.{Watermark, WatermarkGenerator, WatermarkGeneratorSupplier, WatermarkOutput, WatermarkStrategy} 8 | import org.apache.flink.api.common.functions.RichMapFunction 9 | import org.apache.flink.connector.kafka.source.KafkaSource 10 | import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer 11 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 12 | import org.apache.flink.api.scala._ 13 | import org.apache.flink.configuration.Configuration 14 | 15 | /* 16 | 计算数据波动 17 | */ 18 | object DataFluctuation { 19 | 20 | def main(args: Array[String]): Unit = { 21 | 22 | val env = StreamExecutionEnvironment.getExecutionEnvironment 23 | env.setParallelism(1) 24 | 25 | val checkpointInterval = 60 * 1000 26 | val checkpointTimeOut = 2 * checkpointInterval 27 | val checkPointPath = "hdfs:///tmp/flink/checkpoint" 28 | val bootstrapServer = "localhost:9092" 29 | val topic = "user_log" 30 | 31 | // set checkpoint 32 | CheckpointUtil.setCheckpoint(env, "FileSystem", checkPointPath, checkpointInterval, checkpointTimeOut) 33 | 34 | val kafkaSource = KafkaSource 35 | .builder[KafkaSimpleStringRecord]() 36 | .setBootstrapServers(bootstrapServer) 37 | .setTopics(topic) 38 | .setDeserializer(new SimpleKafkaRecordDeserializationSchema) 39 | .setStartingOffsets(OffsetsInitializer.latest()) 40 | .build() 41 | 42 | val source = env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "kafkaSource") 43 | 44 | val stream = source.map(new RichMapFunction[KafkaSimpleStringRecord, (String, Double, Long)] { 45 | var jsonParser: JsonParser = _ 46 | 47 | override def open(parameters: Configuration): Unit = { 48 | jsonParser = new JsonParser 49 | } 50 | 51 | override def map(element: KafkaSimpleStringRecord): (String, Double, Long) = { 52 | 
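// parse the raw Kafka value as JSON and emit (item, price, event-time millis); DateTimeUtil.parse is assumed to return a java.util.Date for the "ts" string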
53 | val json = jsonParser.parse(element.getValue).getAsJsonObject 54 | 55 | val item = json.get("item").getAsString 56 | val price = json.get("price").getAsDouble 57 | val tsStr = json.get("ts").getAsString 58 | val ts = DateTimeUtil.parse(tsStr).getTime 59 | 60 | (item, price, ts) 61 | } 62 | }) 63 | .name("map") 64 | .uid("map") 65 | 66 | stream 67 | .assignTimestampsAndWatermarks(WatermarkStrategy 68 | .forGenerator((_: WatermarkGeneratorSupplier.Context) => { 69 | new WatermarkGenerator[(String,Double, Long)] { 70 | var current = 0l 71 | override def onEvent(t: (String, Double, Long), l: Long, watermarkOutput: WatermarkOutput): Unit = { 72 | if(t._3 > current){ 73 | current = t._3 74 | watermarkOutput.emitWatermark(new Watermark(current)) 75 | } 76 | 77 | } 78 | 79 | override def onPeriodicEmit(watermarkOutput: WatermarkOutput): Unit = { 80 | // 81 | } 82 | } 83 | 84 | })) 85 | 86 | 87 | 88 | } 89 | 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/connector/kafka/KafkaSinkTest.scala: -------------------------------------------------------------------------------- 1 | package com.venn.connector.kafka 2 | 3 | import com.venn.common.Common 4 | import com.venn.question.retention.RetentionAnalyze.bootstrapServer 5 | import org.apache.flink.api.common.eventtime.WatermarkStrategy 6 | import org.apache.flink.api.common.functions.RichFlatMapFunction 7 | import org.apache.flink.api.common.serialization.SimpleStringSchema 8 | import org.apache.flink.api.scala._ 9 | import org.apache.flink.connector.kafka.sink.{KafkaRecordSerializationSchema, KafkaSink} 10 | import org.apache.flink.connector.kafka.source.KafkaSource 11 | import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer 12 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 13 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer 14 | import org.apache.flink.util.Collector 15 | import org.slf4j.LoggerFactory 16 | 17 | /** 18 | * 19 | * 请教个问题哈,sink 到 kafka,采用默认的分区器,是不是每个并行度都会与kafka的partition维护一个连接 20 | 21 | 比如 10 个并行度,3个 partition,那么维护的连接数总共为 10*3 个

? 是的 22 | 23 | 还是一个taskManager建立一个生产者 一个生产者对应多个分区 24 | 25 | 一个taskManager里面多个slot共享一个生产者? no 26 | */ 27 | object KafkaSinkTest { 28 | 29 | val LOG = LoggerFactory.getLogger("KafkaSinkTest") 30 | 31 | def main(args: Array[String]): Unit = { 32 | 33 | val topic = "user_log" 34 | val sinkTopic = "user_log_sink_1" 35 | 36 | // env 37 | val env = StreamExecutionEnvironment.getExecutionEnvironment 38 | // global parllelism 39 | val parallelism = 4 40 | env.setParallelism(parallelism) 41 | 42 | // kafka source 43 | val kafkaSource = KafkaSource.builder[String]() 44 | .setBootstrapServers(Common.BROKER_LIST) 45 | .setTopics(topic) 46 | .setGroupId("KafkaSinkTest") 47 | .setStartingOffsets(OffsetsInitializer.latest()) 48 | .setValueOnlyDeserializer(new SimpleStringSchema()) 49 | .build(); 50 | 51 | // kafka sink 52 | val kafkaSink = KafkaSink 53 | .builder[String]() 54 | .setBootstrapServers(bootstrapServer) 55 | .setKafkaProducerConfig(Common.getProp) 56 | .setRecordSerializer(KafkaRecordSerializationSchema.builder[String]() 57 | .setTopic(sinkTopic) 58 | // 不指定 key 的序列号器,key 会为 空 59 | // .setKeySerializationSchema(new SimpleStringSchema()) 60 | .setValueSerializationSchema(new SimpleStringSchema()) 61 | .build() 62 | ) 63 | .build() 64 | 65 | 66 | // add source,读取数据 67 | val sourceStream = env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "kafkaSource") 68 | 69 | // map, add current subtask index 70 | val mapStream = sourceStream 71 | // rebalance data to all parallelisn 72 | .rebalance 73 | .flatMap(new RichFlatMapFunction[String, String] { 74 | override def flatMap(element: String, out: Collector[String]): Unit = { 75 | val parallelism = getRuntimeContext.getIndexOfThisSubtask 76 | out.collect(parallelism + "," + element) 77 | 78 | } 79 | }) 80 | .name("flatMap") 81 | .uid("flatMap") 82 | 83 | // sink to kafka, new api 84 | // mapStream.sinkTo(kafkaSink) 85 | 86 | // sink to kafka, old api 87 | val kafkaProducer = new FlinkKafkaProducer[String](bootstrapServer,sinkTopic, new SimpleStringSchema()) 88 | mapStream.addSink(kafkaProducer) 89 | .setParallelism(parallelism) 90 | 91 | env.execute("KafkaSinkTest") 92 | } 93 | 94 | } 95 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/cep/ContinueRising.scala: -------------------------------------------------------------------------------- 1 | //package com.venn.cep 2 | // 3 | //import java.util 4 | // 5 | //import org.apache.flink.api.scala._ 6 | //import org.apache.flink.cep.functions.PatternProcessFunction 7 | //import org.apache.flink.cep.pattern.conditions.IterativeCondition 8 | //import org.apache.flink.cep.scala.CEP 9 | //import org.apache.flink.cep.scala.pattern.Pattern 10 | //import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment} 11 | //import org.apache.flink.streaming.api.windowing.time.Time 12 | //import org.apache.flink.util.Collector 13 | //import org.slf4j.LoggerFactory 14 | // 15 | ///** 16 | // * Cep for price continue rising 17 | // * CEP : 匹配价格连续上涨(keyby 可以匹配同一个商品价格连续上涨) 18 | // * 19 | // */ 20 | //object ContinueRising { 21 | // val logger = LoggerFactory.getLogger(this.getClass) 22 | // 23 | // def main(args: Array[String]): Unit = { 24 | // 25 | // val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment 26 | // 27 | // // 输入 id, volumn, name 三个字段的数据 28 | // val input = env.addSource(new CepDemoSourceFunction) 29 | // .map(str => { 30 | // // logger.info(str) 31 | // val arr = str.split(",") 32 | // 
val id = arr(0) 33 | // val volume = arr(1).toInt 34 | // val name = arr(2) 35 | // CepDemoEvent(id, volume, name, arr(3).toInt) 36 | // }) 37 | // // Applying your pattern on a non-keyed stream will result in a job with parallelism equal to 1 38 | // // .keyBy(_.id) 39 | // 40 | // /** 41 | // * 模式说明: 42 | // * 匹配价格连续上涨 43 | // * 44 | // * 匹配后跳过策略: 默认从上次的开始事件后的下一个事件开始 45 | // * 46 | // */ 47 | // val pattern = Pattern.begin[CepDemoEvent]("first") 48 | // .next("second").where(new IterativeCondition[CepDemoEvent] { 49 | // override def filter(currentEvent: CepDemoEvent, context: IterativeCondition.Context[CepDemoEvent]): Boolean = { 50 | // // get last event 51 | // val firstList = context.getEventsForPattern("first").iterator() 52 | // var lastStart: CepDemoEvent = null 53 | // // get last from firstList, and get the last one 54 | // while (firstList.hasNext) { 55 | // lastStart = firstList.next() 56 | // } 57 | // if (currentEvent.volume > lastStart.volume) { 58 | // true 59 | // } else { 60 | // false 61 | // } 62 | // } 63 | // }) 64 | // // always remember add within, it will reduce the state usage 65 | // .within(Time.minutes(5 * 60 * 1000)) 66 | // 67 | // val patternStream = CEP.pattern(input, pattern) 68 | // 69 | // val result: DataStream[String] = patternStream.process( 70 | // new PatternProcessFunction[CepDemoEvent, String]() { 71 | // override def processMatch( 72 | // events: util.Map[String, util.List[CepDemoEvent]], 73 | // ctx: PatternProcessFunction.Context, 74 | // out: Collector[String]): Unit = { 75 | // // get the change 76 | // val first = events.get("first").get(0) 77 | // val second = events.get("second").get(0) 78 | // val change = second.volume - first.volume 79 | // out.collect("from : " + first.id + ", to " + second.id + ", change : " + change) 80 | // } 81 | // 82 | // }) 83 | // 84 | // // for convenient, just print 85 | // result.print() 86 | // env.execute(this.getClass.getName) 87 | // } 88 | // 89 | // 90 | //} 91 | // 92 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/source/kafka/KafkaUpsertTableSourceSinkFactory.java: -------------------------------------------------------------------------------- 1 | ///* 2 | // * Licensed to the Apache Software Foundation (ASF) under one or more 3 | // * contributor license agreements. See the NOTICE file distributed with 4 | // * this work for additional information regarding copyright ownership. 5 | // * The ASF licenses this file to You under the Apache License, Version 2.0 6 | // * (the "License"); you may not use this file except in compliance with 7 | // * the License. You may obtain a copy of the License at 8 | // * 9 | // * http://www.apache.org/licenses/LICENSE-2.0 10 | // * 11 | // * Unless required by applicable law or agreed to in writing, software 12 | // * distributed under the License is distributed on an "AS IS" BASIS, 13 | // * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // * See the License for the specific language governing permissions and 15 | // * limitations under the License. 
16 | // */ 17 | // 18 | //package com.venn.source.kafka; 19 | // 20 | //import org.apache.flink.api.common.serialization.DeserializationSchema; 21 | //import org.apache.flink.api.common.serialization.SerializationSchema; 22 | //import org.apache.flink.streaming.connectors.kafka.KafkaTableSource; 23 | //import org.apache.flink.streaming.connectors.kafka.KafkaTableSourceBase; 24 | //import org.apache.flink.streaming.connectors.kafka.config.StartupMode; 25 | //import org.apache.flink.streaming.connectors.kafka.internals.KafkaTopicPartition; 26 | //import org.apache.flink.streaming.connectors.kafka.partitioner.FlinkKafkaPartitioner; 27 | //import org.apache.flink.table.api.TableSchema; 28 | //import org.apache.flink.table.sources.RowtimeAttributeDescriptor; 29 | //import org.apache.flink.types.Row; 30 | // 31 | //import java.util.List; 32 | //import java.util.Map; 33 | //import java.util.Optional; 34 | //import java.util.Properties; 35 | // 36 | ///** 37 | // * Factory for creating configured instances of {@link KafkaTableSource}. 38 | // */ 39 | //public class KafkaUpsertTableSourceSinkFactory extends KafkaUpsertTableSourceSinkFactoryBase { 40 | // 41 | // @Override 42 | // protected String kafkaVersion() { 43 | // return MyKafkaValidator.CONNECTOR_VERSION_VALUE_UNIVERSAL; 44 | // } 45 | // 46 | // @Override 47 | // protected boolean supportsKafkaTimestamps() { 48 | // return true; 49 | // } 50 | // 51 | // @Override 52 | // protected KafkaTableSourceBase createKafkaTableSource( 53 | // TableSchema schema, 54 | // Optional proctimeAttribute, 55 | // List rowtimeAttributeDescriptors, 56 | // Map fieldMapping, 57 | // String topic, 58 | // Properties properties, 59 | // DeserializationSchema deserializationSchema, 60 | // StartupMode startupMode, 61 | // Map specificStartupOffsets) { 62 | // 63 | // return new KafkaTableSource( 64 | // schema, 65 | // proctimeAttribute, 66 | // rowtimeAttributeDescriptors, 67 | // Optional.of(fieldMapping), 68 | // topic, 69 | // properties, 70 | // deserializationSchema, 71 | // startupMode, 72 | // specificStartupOffsets); 73 | // } 74 | // 75 | // @Override 76 | // protected KafkaUpsertTableSink createKafkaTableSink( 77 | // TableSchema schema, 78 | // String topic, 79 | // Properties properties, 80 | // Optional> partitioner, 81 | // SerializationSchema serializationSchema) { 82 | // 83 | // return new KafkaUpsertTableSink( 84 | // schema, 85 | // topic, 86 | // properties, 87 | // partitioner, 88 | // serializationSchema); 89 | // } 90 | //} 91 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/question/tryFlink/FraudDetection.scala: -------------------------------------------------------------------------------- 1 | package com.venn.question.tryFlink 2 | 3 | import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor} 4 | import org.apache.flink.api.scala._ 5 | import org.apache.flink.api.scala.typeutils.Types 6 | import org.apache.flink.configuration.Configuration 7 | import org.apache.flink.streaming.api.functions.KeyedProcessFunction 8 | import org.apache.flink.streaming.api.functions.sink.SinkFunction 9 | import org.apache.flink.streaming.api.functions.source.SourceFunction 10 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 11 | import org.apache.flink.util.Collector 12 | import org.slf4j.LoggerFactory 13 | 14 | import scala.util.Random 15 | 16 | /** 17 | * source from : flink official website : 基于 DataStream API 实现欺诈检测 18 | */ 19 | object FraudDetection { 
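// detection rule (implemented in FuaudDetectionProcessFunction below): a small transaction (< 2) arms a per-account flag in keyed ValueState; if a large transaction (> 95) follows while the flag is set, the account id is emitted as an alert, and a processing-time timer clears the flag 10 seconds after the small transaction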
20 | 21 | private val LOG = LoggerFactory.getLogger("FraudDetection") 22 | 23 | def main(args: Array[String]): Unit = { 24 | 25 | val env = StreamExecutionEnvironment.getExecutionEnvironment 26 | env.setParallelism(1) 27 | 28 | val source = env.addSource(new FraudDetectionSource) 29 | .name("source") 30 | 31 | val process = source 32 | .keyBy(_._1) 33 | .process(new FraudDetectionProcessFunction) 34 | 35 | process.addSink(new SinkFunction[String]{ 36 | override def invoke(element: String, context: SinkFunction.Context): Unit = { 37 | println("fraud detection alert : " + element) 38 | } 39 | } ) 40 | 41 | env.execute("FraudDetection") 42 | 43 | } 44 | 45 | } 46 | 47 | class FraudDetectionSource extends SourceFunction[(String, Double)] { 48 | val LOG = LoggerFactory.getLogger("FraudDetectionSource") 49 | var isRunning = true 50 | val random = new Random() 51 | 52 | override def run(sourceContext: SourceFunction.SourceContext[(String, Double)]): Unit = { 53 | 54 | while (isRunning) { 55 | val accountId = "" + random.nextInt(1000) 56 | val amt = random.nextDouble() * 100 57 | 58 | sourceContext.collect((accountId, amt)) 59 | 60 | Thread.sleep(1) 61 | } 62 | LOG.info("source finish") 63 | } 64 | 65 | override def cancel(): Unit = { 66 | 67 | LOG.info("source canceled...") 68 | isRunning = false 69 | } 70 | } 71 | 72 | class FraudDetectionProcessFunction extends KeyedProcessFunction[String, (String, Double), String] { 73 | 74 | var smallFlag: ValueState[java.lang.Boolean] = _ 75 | 76 | override def open(parameters: Configuration): Unit = { 77 | smallFlag = getRuntimeContext.getState(new ValueStateDescriptor("smallTransaction", Types.BOOLEAN)) 78 | } 79 | 80 | override def processElement(element: (String, Double), context: KeyedProcessFunction[String, (String, Double), String]#Context, collector: Collector[String]): Unit = { 81 | 82 | if (smallFlag.value() != null && smallFlag.value() && element._2 > 95) { 83 | collector.collect(element._1) 84 | } 85 | 86 | if (element._2 < 2) { 87 | smallFlag.update(true) 88 | context.timerService().registerProcessingTimeTimer(System.currentTimeMillis() + 10 * 1000) 89 | } 90 | 91 | } 92 | 93 | 94 | override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[String, (String, Double), String]#OnTimerContext, out: Collector[String]): Unit = { 95 | println("clear key : " + ctx.getCurrentKey) 96 | smallFlag.clear() 97 | } 98 | 99 | override def close(): Unit = { 100 | smallFlag.clear() 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/connector/starrocks/StreamLoadTestV2.scala: -------------------------------------------------------------------------------- 1 | package com.venn.connector.starrocks 2 | 3 | import com.starrocks.connector.flink.StarRocksSink 4 | import com.starrocks.connector.flink.table.sink.StarRocksSinkOptions 5 | import org.apache.flink.api.common.functions.RichMapFunction 6 | import org.apache.flink.api.scala._ 7 | import org.apache.flink.configuration.Configuration 8 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 9 | import org.slf4j.LoggerFactory 10 | 11 | import scala.util.Random 12 | 13 | object StreamLoadTestV2 { 14 | 15 | val LOG = LoggerFactory.getLogger("StreamLoadTestV2") 16 | val COL_SEP = "\\\\x01" 17 | val ROW_SEP = "\\\\x02" 18 | val ip = "10.201.0.230" 19 | val jdbcPort = "29030" 20 | val httpPort = "28030" 21 | val user = "root" 22 | val pass = "123456" 23 | val sql = "select * from test.t_starrocks_load_error limit 2000" 24 | 
var batch = 1000 25 | var interval = 5 26 | 27 | def main(args: Array[String]): Unit = { 28 | 29 | if (args.length >= 2) { 30 | batch = Integer.parseInt(args(0)) 31 | interval = Integer.parseInt(args(1)) 32 | } 33 | 34 | 35 | val env = StreamExecutionEnvironment.getExecutionEnvironment 36 | env.setParallelism(1) 37 | 38 | 39 | 40 | val source = env.addSource(new CustJdbcSource(ip, jdbcPort, user, pass, sql, COL_SEP, batch, interval)) 41 | 42 | val stream = source.map(new RichMapFunction[String, String] { 43 | 44 | var random: Random = _ 45 | 46 | override def open(parameters: Configuration): Unit = { 47 | random = new Random(); 48 | 49 | } 50 | 51 | override def map(element: String): String = { 52 | 53 | val index = element.indexOf(COL_SEP) 54 | 55 | val prex = element.substring(0, index) 56 | val subx = element.substring(index) 57 | 58 | var newPrex = 0l 59 | try { 60 | newPrex = prex.toLong / (random.nextInt(10000) + 1) 61 | } catch { 62 | case ex: java.lang.ArithmeticException => 63 | newPrex = random.nextLong() 64 | ex.printStackTrace() 65 | LOG.info("prex : {}", prex) 66 | 67 | case _ => 68 | 69 | } 70 | 71 | newPrex + subx 72 | } 73 | }) 74 | 75 | val sink = StarRocksSink.sink( 76 | // the sink options 77 | StarRocksSinkOptions.builder() 78 | .withProperty("jdbc-url", "jdbc:mysql://" + ip + ":" + jdbcPort) 79 | .withProperty("load-url", ip + ":" + httpPort) 80 | .withProperty("username", user) 81 | .withProperty("password", pass) 82 | .withProperty("database-name", "test") 83 | .withProperty("table-name", "t_starrocks_load_error_3") 84 | // 自 2.4 版本,支持更新主键模型中的部分列。您可以通过以下两个属性指定需要更新的列。 85 | // .withProperty("sink.properties.partial_update", "true") 86 | // .withProperty("sink.properties.columns", "k1,k2,k3") 87 | // .withProperty("sink.properties.format", "json") 88 | // .withProperty("sink.properties.strip_outer_array", "true") 89 | .withProperty("sink.properties.row_delimiter", ROW_SEP) 90 | .withProperty("sink.properties.column_separator", COL_SEP) 91 | // 设置并行度,多并行度情况下需要考虑如何保证数据有序性 92 | .withProperty("sink.parallelism", "1") 93 | .withProperty("sink.buffer-flush.max-rows", "" + batch) 94 | .build()) 95 | 96 | stream.addSink(sink) 97 | .uid("sink") 98 | .name("sink") 99 | 100 | env.execute("StreamLoadTest") 101 | 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/main/java/com/venn/demo/AsyncRedisFunction.java: -------------------------------------------------------------------------------- 1 | package com.venn.demo; 2 | 3 | import com.google.gson.JsonParser; 4 | import io.lettuce.core.RedisClient; 5 | import io.lettuce.core.RedisFuture; 6 | import io.lettuce.core.api.StatefulRedisConnection; 7 | import io.lettuce.core.api.async.RedisAsyncCommands; 8 | import org.apache.flink.configuration.Configuration; 9 | import org.apache.flink.streaming.api.functions.async.ResultFuture; 10 | import org.apache.flink.streaming.api.functions.async.RichAsyncFunction; 11 | 12 | import java.util.Collections; 13 | import java.util.concurrent.CompletableFuture; 14 | import java.util.concurrent.ExecutionException; 15 | import java.util.function.Consumer; 16 | 17 | /** 18 | * async redis function 19 | */ 20 | public class AsyncRedisFunction extends RichAsyncFunction { 21 | private RedisAsyncCommands async; 22 | private String url; 23 | private StatefulRedisConnection connection; 24 | private RedisClient redisClient; 25 | private JsonParser jsonParser; 26 | 27 | public AsyncRedisFunction(String url) { 28 | this.url = url; 29 | } 30 | 31 | @Override 
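    // Usage sketch (illustrative only): a RichAsyncFunction like this one is applied to a
    // stream via AsyncDataStream, e.g.
    //   AsyncDataStream.unorderedWait(stream,
    //       new AsyncRedisFunction("redis://localhost:6379"), 30, TimeUnit.SECONDS, 100);
    // The Redis URL, the 30-second timeout and the capacity of 100 are placeholder values.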
32 | public void open(Configuration parameters) throws Exception { 33 | // redis standalone 34 | redisClient = RedisClient.create(url); 35 | connection = redisClient.connect(); 36 | 37 | // redis cluster 38 | // List uriList = new ArrayList<>(); 39 | // for (String tmp : url.split(",")) { 40 | // String[] str = tmp.split(":"); 41 | // String host = str[0]; 42 | // int port = Integer.parseInt(str[1]); 43 | // RedisURI redisUri = RedisURI.Builder.redis(host).withPort(port).build(); 44 | // uriList.add(redisUri); 45 | // } 46 | // RedisClusterClient redisClient = redisClusterClient.create(uriList); 47 | // connection = redisClient.connect(); 48 | 49 | // async 50 | async = connection.async(); 51 | 52 | jsonParser = new JsonParser(); 53 | } 54 | 55 | 56 | //数据处理的方法 57 | @Override 58 | public void asyncInvoke(String input, ResultFuture resultFuture) throws Exception { 59 | 60 | String userId = jsonParser.parse(input).getAsJsonObject().get("user_id").getAsString(); 61 | // query string 62 | RedisFuture redisFuture = async.get(userId); 63 | // query hash 64 | // RedisFuture redisFuture = async.hget("key", input); 65 | // get all 66 | // async.hgetall(input); 67 | 68 | // async query and get result 69 | CompletableFuture.supplyAsync(() -> { 70 | try { 71 | return redisFuture.get(); 72 | } catch (InterruptedException e) { 73 | e.printStackTrace(); 74 | } catch (ExecutionException e) { 75 | e.printStackTrace(); 76 | } 77 | // if get exception 78 | return "exception"; 79 | }).thenAccept(new Consumer() { 80 | @Override 81 | public void accept(String result) { 82 | if (result == null) { 83 | result = "nothing"; 84 | } 85 | // return result 86 | resultFuture.complete(Collections.singleton(input + " - " + result)); 87 | } 88 | }); 89 | } 90 | 91 | @Override 92 | public void close() throws Exception { 93 | super.close(); 94 | if (connection != null) { 95 | connection.close(); 96 | } 97 | if (redisClient != null) { 98 | redisClient.shutdown(); 99 | } 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/stream/api/intervalJoin/IntervalJoinDemo.scala: -------------------------------------------------------------------------------- 1 | package com.venn.stream.api.intervalJoin 2 | 3 | import java.io.File 4 | import java.text.SimpleDateFormat 5 | 6 | import com.venn.common.Common 7 | import com.venn.source.TumblingEventTimeWindows 8 | import com.venn.util.CheckpointUtil 9 | import org.apache.flink.api.common.functions.ReduceFunction 10 | import org.apache.flink.formats.json.JsonNodeDeserializationSchema 11 | import org.apache.flink.runtime.state.filesystem.FsStateBackend 12 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.ObjectNode 13 | import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic} 14 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 15 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer 16 | import org.apache.flink.api.scala._ 17 | import org.apache.flink.streaming.api.windowing.time.Time 18 | 19 | /** 20 | * interval join demo 21 | */ 22 | object IntervalJoinDemo { 23 | 24 | def main(args: Array[String]): Unit = { 25 | 26 | val env = StreamExecutionEnvironment.getExecutionEnvironment 27 | // env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) 28 | // if ("/".equals(File.separator)) { 29 | // val backend = new FsStateBackend(Common.CHECK_POINT_DATA_DIR, true) 30 | // env.setStateBackend(backend) 31 | // 
env.enableCheckpointing(10 * 1000, CheckpointingMode.EXACTLY_ONCE) 32 | // } else { 33 | // env.setMaxParallelism(1) 34 | // env.setParallelism(1) 35 | // } 36 | // env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) 37 | CheckpointUtil.setCheckpoint(env, "rocksdb", Common.CHECK_POINT_DATA_DIR, 10) 38 | 39 | val sdf = new SimpleDateFormat("yyyyMMddHHmmss") 40 | val sourceLeft = new FlinkKafkaConsumer[ObjectNode]("topic_left", new JsonNodeDeserializationSchema, Common.getProp) 41 | val sourceRight = new FlinkKafkaConsumer[ObjectNode]("topic_right", new JsonNodeDeserializationSchema, Common.getProp) 42 | 43 | sourceLeft.setStartFromLatest() 44 | sourceRight.setStartFromLatest() 45 | 46 | // transfer left stream json to AsyncUser 47 | val leftStream = env.addSource(sourceLeft) 48 | .map(json => { 49 | val id = json.get("id").asText() 50 | val name = json.get("name").asText() 51 | val date = json.get("date").asText() 52 | IntervalUser(id, name, null, date) 53 | }) 54 | .assignAscendingTimestamps(u => sdf.parse(u.date).getTime) 55 | .keyBy(0) 56 | // transfer right stream json to AsyncUser 57 | val rightStream = env.addSource(sourceRight) 58 | .map(json => { 59 | val id = json.get("id").asText() 60 | val phone = json.get("phone").asText() 61 | val date = json.get("date").asText() 62 | IntervalUser(id, null, phone, date) 63 | }) 64 | .assignAscendingTimestamps(u => sdf.parse(u.date).getTime) 65 | .keyBy(0) 66 | 67 | // join it 68 | /* 69 | 左边为主,两边都可以触发,触发范围: 70 | a.timestamp + lowerBound <= b.timestamp <= a.timestamp + upperBound 71 | 72 | */ 73 | leftStream 74 | .intervalJoin(rightStream) 75 | .between(Time.seconds(-2), Time.seconds(7)) 76 | //.lowerBoundExclusive() // 排除下界 77 | // .upperBoundExclusive() // 排除上界 78 | .process(new IntervalJoinProcessFunctionDemo) 79 | /*.assignAscendingTimestamps(_.phone.toLong) 80 | .keyBy("id") 81 | .window(TumblingEventTimeWindows.of(Time.milliseconds(10))) 82 | .min("id")*/ 83 | /*.reduce(new ReduceFunction[IntervalUser] { 84 | override def reduce(value1: IntervalUser, value2: IntervalUser): IntervalUser = { 85 | println("xx -> " + value2) 86 | value2 87 | } 88 | })*/ 89 | 90 | .print("result -> ") 91 | 92 | env.execute("IntervalJoinDemo") 93 | } 94 | 95 | } 96 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/cep/AfterMatchStrategyDemo.scala: -------------------------------------------------------------------------------- 1 | //package com.venn.cep 2 | // 3 | //import java.util 4 | // 5 | //import com.venn.common.Common 6 | //import org.apache.flink.api.common.serialization.SimpleStringSchema 7 | //import org.apache.flink.api.scala._ 8 | //import org.apache.flink.cep.functions.PatternProcessFunction 9 | //import org.apache.flink.cep.nfa.aftermatch.AfterMatchSkipStrategy 10 | //import org.apache.flink.cep.pattern.conditions.IterativeCondition 11 | //import org.apache.flink.cep.scala.CEP 12 | //import org.apache.flink.cep.scala.pattern.Pattern 13 | //import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment} 14 | //import org.apache.flink.streaming.api.windowing.time.Time 15 | //import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer 16 | //import org.apache.flink.util.Collector 17 | //import org.slf4j.LoggerFactory 18 | // 19 | ///** 20 | // * Cep for after match strategy 21 | // * CEP : 模式匹配后的跳过策略测试: 22 | // * 23 | // * NO_SKIP: 24 | // * SKIP_TO_NEXT: 25 | // * SKIP_PAST_LAST_EVENT: 26 | // * SKIP_TO_FIRST[b]: 27 | // * SKIP_TO_LAST[b]: 28 | // * 29 
| // * Command : 30 | // * 31 | // */ 32 | //object AfterMatchStrategyDemo { 33 | // val logger = LoggerFactory.getLogger(this.getClass) 34 | // 35 | // def main(args: Array[String]): Unit = { 36 | // 37 | // val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment 38 | // 39 | // env.setParallelism(1) 40 | // val topic = "match_strategy" 41 | // val source = new FlinkKafkaConsumer[String](topic, new SimpleStringSchema(), Common.getProp) 42 | // 43 | // val input = env.addSource(source) 44 | // .map(str => { 45 | // // logger.info(str) 46 | // val arr = str.split(",") 47 | // val id = arr(0) 48 | // val name = arr(1) 49 | // CepDemoEvent(id, 0, name, 0) 50 | // }).setParallelism(1) 51 | // // Applying your pattern on a non-keyed stream will result in a job with parallelism equal to 1 52 | // // .keyBy(_.id) 53 | // 54 | // /** 55 | // * 模式说明: 56 | // * 匹配价格连续上涨 57 | // * 58 | // * 匹配后跳过策略: 默认从上次的开始事件后的下一个事件开始 59 | // * 60 | // * NO_SKIP:default 61 | // * SKIP_TO_NEXT: 62 | // * SKIP_PAST_LAST_EVENT: 63 | // * SKIP_TO_FIRST[b]: 64 | // * SKIP_TO_LAST[b]: 65 | // * 66 | // */ 67 | // val noSkit = AfterMatchSkipStrategy.noSkip() 68 | // val pattern = Pattern.begin[CepDemoEvent]("first").where(event => { 69 | // event.name.equals("a") 70 | // }) 71 | // // .timesOrMore(1) 72 | // .next("second").where(event => { 73 | // event.name.equals("a") 74 | // }) 75 | // .next("third").where(event => { 76 | // event.name.equals("b") 77 | // }) 78 | //// .notNext() 79 | // 80 | // // always remember add within, it will reduce the state usage 81 | // // .within(Time.minutes(5 * 60 * 1000)) 82 | // 83 | // val patternStream = CEP.pattern(input, pattern) 84 | // 85 | // val result: DataStream[String] = patternStream.process( 86 | // new PatternProcessFunction[CepDemoEvent, String]() { 87 | // override def processMatch( 88 | // events: util.Map[String, util.List[CepDemoEvent]], 89 | // ctx: PatternProcessFunction.Context, 90 | // out: Collector[String]): Unit = { 91 | // // get the change 92 | // val first = events.get("first").get(0) 93 | // val second = events.get("second").get(0) 94 | // val third = events.get("third").get(0) 95 | // out.collect("first : " + first + ", first " + second + ", third : " + third) 96 | // } 97 | // 98 | // }) 99 | // 100 | // // for convenient, just print 101 | // result.print() 102 | // env.execute(this.getClass.getName) 103 | // } 104 | // 105 | // 106 | //} 107 | // 108 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/connector/starrocks/StreamLoadTest.scala: -------------------------------------------------------------------------------- 1 | package com.venn.connector.starrocks 2 | 3 | import com.starrocks.connector.flink.StarRocksSink 4 | import com.starrocks.connector.flink.table.sink.StarRocksSinkOptions 5 | import org.apache.flink.api.common.functions.RichMapFunction 6 | import org.apache.flink.api.common.restartstrategy.RestartStrategies 7 | import org.apache.flink.api.common.restartstrategy.RestartStrategies.FixedDelayRestartStrategyConfiguration 8 | import org.apache.flink.api.common.time.Time 9 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 10 | import org.apache.flink.api.scala._ 11 | import org.apache.flink.configuration.{Configuration, RestartStrategyOptions} 12 | import org.slf4j.LoggerFactory 13 | 14 | import scala.util.Random 15 | 16 | object StreamLoadTest { 17 | 18 | val LOG = LoggerFactory.getLogger("StreamLoadTest") 19 | // val COL_SEP = 
"\\\\x01"; 20 | // val ROW_SEP = "\\\\x02"; 21 | val COL_SEP = "," 22 | val ROW_SEP = "\\n" 23 | val ip = "10.201.0.230" 24 | val jdbcPort = "29030" 25 | val httpPort = "28030" 26 | val user = "root" 27 | val pass = "123456" 28 | val sql = "select * from test.t_starrocks_load_error limit 1000" 29 | var batch = 1000 30 | var interval = 5 31 | 32 | def main(args: Array[String]): Unit = { 33 | 34 | if (args.length >= 2) { 35 | batch = Integer.parseInt(args(0)) 36 | interval = Integer.parseInt(args(1)) 37 | } 38 | 39 | 40 | val env = StreamExecutionEnvironment.getExecutionEnvironment 41 | env.setParallelism(1) 42 | env.setRestartStrategy(RestartStrategies.fixedDelayRestart(10, Time.seconds(20))) 43 | 44 | val source = env.addSource(new CustJdbcSource(ip, jdbcPort, user, pass, sql, COL_SEP, batch, interval)) 45 | 46 | val stream = source.map(new RichMapFunction[String, String] { 47 | 48 | var random: Random = _ 49 | 50 | override def open(parameters: Configuration): Unit = { 51 | random = new Random(); 52 | 53 | } 54 | 55 | override def map(element: String): String = { 56 | 57 | val index = element.indexOf(COL_SEP) 58 | 59 | val prex = element.substring(0, index) 60 | val subx = element.substring(index) 61 | 62 | var newPrex = 0l 63 | try { 64 | newPrex = prex.toLong / (random.nextInt(10000) + 1) 65 | } catch { 66 | case ex: java.lang.ArithmeticException => 67 | newPrex = random.nextLong() 68 | ex.printStackTrace() 69 | LOG.info("prex : {}", prex) 70 | 71 | case _ => 72 | 73 | } 74 | 75 | newPrex + subx 76 | } 77 | }) 78 | 79 | val sink = StarRocksSink.sink( 80 | // the sink options 81 | StarRocksSinkOptions.builder() 82 | .withProperty("jdbc-url", "jdbc:mysql://" + ip + ":" + jdbcPort) 83 | .withProperty("load-url", ip + ":" + httpPort) 84 | .withProperty("username", user) 85 | .withProperty("password", pass) 86 | .withProperty("database-name", "test") 87 | .withProperty("table-name", "t_starrocks_load_error_3") 88 | // 自 2.4 版本,支持更新主键模型中的部分列。您可以通过以下两个属性指定需要更新的列。 89 | // .withProperty("sink.properties.partial_update", "true") 90 | // .withProperty("sink.properties.columns", "k1,k2,k3") 91 | // .withProperty("sink.properties.format", "json") 92 | // .withProperty("sink.properties.strip_outer_array", "true") 93 | .withProperty("sink.properties.row_delimiter", ROW_SEP) 94 | .withProperty("sink.properties.column_separator", COL_SEP) 95 | // 设置并行度,多并行度情况下需要考虑如何保证数据有序性 96 | .withProperty("sink.parallelism", "1") 97 | .withProperty("sink.version", "v1") 98 | .withProperty("sink.buffer-flush.max-rows", "" + batch) 99 | .build()) 100 | 101 | stream.addSink(sink) 102 | .uid("sink") 103 | .name("sink") 104 | 105 | env.execute("StreamLoadTest") 106 | 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /src/main/java/com/venn/demo/TypeTest.java: -------------------------------------------------------------------------------- 1 | package com.venn.demo; 2 | 3 | import com.venn.common.Common; 4 | import com.venn.util.DateTimeUtil; 5 | import org.apache.flink.api.common.RuntimeExecutionMode; 6 | import org.apache.flink.api.common.eventtime.WatermarkStrategy; 7 | import org.apache.flink.api.common.serialization.SimpleStringSchema; 8 | import org.apache.flink.api.common.typeinfo.TypeInformation; 9 | import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema; 10 | import org.apache.flink.connector.kafka.sink.KafkaSink; 11 | import org.apache.flink.connector.kafka.source.KafkaSource; 12 | import 
org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer; 13 | import org.apache.flink.connector.kafka.source.reader.deserializer.KafkaRecordDeserializationSchema; 14 | import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; 15 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 16 | import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows; 17 | import org.apache.flink.streaming.api.windowing.time.Time; 18 | import org.apache.flink.util.Collector; 19 | import org.apache.kafka.clients.consumer.ConsumerRecord; 20 | 21 | import java.io.IOException; 22 | 23 | public class TypeTest { 24 | 25 | public static void main(String[] args) throws Exception { 26 | 27 | StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 28 | 29 | // env.setRuntimeMode(RuntimeExecutionMode.BATCH); 30 | 31 | env.setParallelism(1); 32 | String bootstrapServer = "localhost:9092"; 33 | String topic = "user_log"; 34 | // source 35 | KafkaSource kafkaSource = KafkaSource 36 | .builder() 37 | .setBootstrapServers(bootstrapServer) 38 | .setGroupId("ra") 39 | .setTopics(topic) 40 | .setBounded(OffsetsInitializer.timestamp(DateTimeUtil.parse("2022-04-29 12:00:00").getTime())) 41 | // .setUnbounded(OffsetsInitializer.latest()) 42 | .setStartingOffsets(OffsetsInitializer.earliest()) 43 | .setDeserializer(new KafkaRecordDeserializationSchema() { 44 | @Override 45 | public TypeInformation getProducedType() { 46 | return null; 47 | } 48 | @Override 49 | public void deserialize(ConsumerRecord record, Collector out) throws IOException { 50 | byte[] value = (byte[])record.value(); 51 | 52 | out.collect(new String(value)); 53 | } 54 | }) 55 | .build(); 56 | 57 | 58 | SingleOutputStreamOperator source = env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "kafkaSource") 59 | .returns(String.class); 60 | 61 | SingleOutputStreamOperator stream = source.map(aa -> aa) 62 | .returns(String.class) 63 | .map(aa -> 1) 64 | .returns(Integer.class) 65 | .windowAll(TumblingProcessingTimeWindows.of(Time.seconds(10))) 66 | .sum(0) 67 | .map(aa -> "" + aa) 68 | .returns(String.class); 69 | 70 | KafkaSink sink = KafkaSink 71 | .builder() 72 | .setBootstrapServers(bootstrapServer) 73 | .setKafkaProducerConfig(Common.getProp()) 74 | .setRecordSerializer(KafkaRecordSerializationSchema.builder() 75 | .setTopic(topic + "_sink") 76 | .setKeySerializationSchema(new SimpleStringSchema()) 77 | .setValueSerializationSchema(new SimpleStringSchema()) 78 | .build() 79 | ) 80 | .setTransactionalIdPrefix("xxx" + System.currentTimeMillis()) 81 | .build(); 82 | 83 | stream.sinkTo(sink); 84 | 85 | 86 | env.execute("typeTest"); 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/stream/api/timer/CustomerTimerDemo.scala: -------------------------------------------------------------------------------- 1 | package com.venn.stream.api.timer 2 | 3 | import java.io.File 4 | import java.sql.{Connection, DriverManager, PreparedStatement, SQLException} 5 | import java.util 6 | import java.util.{Timer, TimerTask} 7 | 8 | import org.apache.flink.api.scala._ 9 | import com.venn.common.Common 10 | import com.venn.util.{CheckpointUtil, TwoStringSource} 11 | import org.apache.flink.api.common.functions.RichMapFunction 12 | import org.apache.flink.api.common.serialization.SimpleStringSchema 13 | import org.apache.flink.configuration.Configuration 14 | import 
org.apache.flink.runtime.state.filesystem.FsStateBackend 15 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 16 | import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic} 17 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer 18 | import org.slf4j.LoggerFactory 19 | 20 | /** 21 | * Use a Timer inside the open method to periodically load external data, e.g. from MySQL. 22 | * Business assumption: during ETL, incoming records must be enriched with data from an external system; that external data is updated over time, so it cannot be loaded once and then ignored. 23 | * At the same time most of it rarely changes (updates are occasional), so async IO feels wasteful here. 24 | * In that case, consider a timer that reloads the data periodically. 25 | * 26 | * 27 | * Connect in map, add a timer, and reload the data from MySQL on a schedule. 28 | */ 29 | object CustomerTimerDemo { 30 | private final val logger = LoggerFactory.getLogger(CustomerTimerDemo.getClass) 31 | 32 | def main(args: Array[String]): Unit = { 33 | 34 | 35 | val env = StreamExecutionEnvironment.getExecutionEnvironment 36 | // env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) 37 | if ("/".equals(File.separator)) { 38 | // val backend = new FsStateBackend(Common.CHECK_POINT_DATA_DIR, true) 39 | // env.setStateBackend(backend) 40 | // env.enableCheckpointing(30 * 60 * 1000, CheckpointingMode.EXACTLY_ONCE) 41 | CheckpointUtil.setCheckpoint(env, "rocksdb", Common.CHECK_POINT_DATA_DIR, 10) 42 | } else { 43 | env.setMaxParallelism(1) 44 | env.setParallelism(1) 45 | } 46 | 47 | // custom source that emits random strings in the format x,xxx 48 | val input = env.addSource(new TwoStringSource) 49 | val stream = input.map(new RichMapFunction[String, String] { 50 | 51 | val jdbcUrl = "jdbc:mysql://venn:3306?useSSL=false&allowPublicKeyRetrieval=true" 52 | val username = "root" 53 | val password = "123456" 54 | val driverName = "com.mysql.jdbc.Driver" 55 | var conn: Connection = null 56 | var ps: PreparedStatement = null 57 | val map = new util.HashMap[String, String]() 58 | 59 | override def open(parameters: Configuration): Unit = { 60 | logger.info("init....") 61 | query() 62 | // new Timer 63 | val timer = new Timer(true) 64 | // initial delay of 10 seconds, then reload every 10 seconds 65 | timer.schedule(new TimerTask { 66 | override def run(): Unit = { 67 | query() 68 | } 69 | }, 10000, 10000) 70 | 71 | } 72 | 73 | override def map(value: String): String = { 74 | // concat input and mysql data 75 | value + "-" + map.get(value.split(",")(0)) 76 | } 77 | 78 | /** 79 | * query mysql for get new config data 80 | */ 81 | def query() = { 82 | logger.info("query mysql") 83 | try { 84 | Class.forName(driverName) 85 | conn = DriverManager.getConnection(jdbcUrl, username, password) 86 | ps = conn.prepareStatement("select id,name from venn.timer") 87 | val rs = ps.executeQuery 88 | 89 | while (!rs.isClosed && rs.next) { 90 | val id = rs.getString(1) 91 | val name = rs.getString(2) 92 | map.put(id, name) 93 | } 94 | logger.info("get config from db size : {}", map.size()) 95 | 96 | } catch { 97 | case e@(_: ClassNotFoundException | _: SQLException) => 98 | e.printStackTrace() 99 | } finally { 100 | if (conn != null) { 101 | conn.close() 102 | } 103 | } 104 | } 105 | }) 106 | // .print() 107 | 108 | 109 | val sink = new FlinkKafkaProducer[String]("timer_out" 110 | , new SimpleStringSchema() 111 | , Common.getProp) 112 | stream.addSink(sink) 113 | env.execute(this.getClass.getName) 114 | 115 | } 116 | 117 | } 118 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/stream/api/broadcast/BroadCastDemo.scala: -------------------------------------------------------------------------------- 1 | package com.venn.stream.api.broadcast 2 | 3 | import java.io.File 4 | 5 | import 
com.venn.common.Common 6 | import com.venn.util.{CheckpointUtil, StringUtil} 7 | import org.apache.flink.api.common.serialization.SimpleStringSchema 8 | import org.apache.flink.api.common.state.MapStateDescriptor 9 | import org.apache.flink.api.common.typeinfo.BasicTypeInfo 10 | import org.apache.flink.api.scala._ 11 | import org.apache.flink.runtime.state.filesystem.FsStateBackend 12 | import org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction 13 | import org.apache.flink.streaming.api.functions.source.SourceFunction 14 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 15 | import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic} 16 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer 17 | import org.apache.flink.util.Collector 18 | 19 | /** 20 | * broadcast 21 | */ 22 | object BroadCastDemo { 23 | 24 | def main(args: Array[String]): Unit = { 25 | val env = StreamExecutionEnvironment.getExecutionEnvironment 26 | // env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) 27 | // if ("/".equals(File.separator)) { 28 | // val backend = new FsStateBackend(Common.CHECK_POINT_DATA_DIR, true) 29 | // env.setStateBackend(backend) 30 | // env.enableCheckpointing(10 * 1000, CheckpointingMode.EXACTLY_ONCE) 31 | // } else { 32 | // env.setMaxParallelism(1) 33 | // env.setParallelism(1) 34 | // } 35 | CheckpointUtil.setCheckpoint(env, "rocksdb", Common.CHECK_POINT_DATA_DIR, 10) 36 | 37 | // 配置更新流 38 | val configSource = new FlinkKafkaConsumer[String]("broad_cast_demo", new SimpleStringSchema, Common.getProp) 39 | // 配置流的初始化,可以通过读取配置文件实现 40 | var initFilePath = "" 41 | if ("/".equals(File.separator)) { 42 | initFilePath = "hdfs:///venn/init_file.txt" 43 | } else { 44 | initFilePath = "D:\\idea_out\\broad_cast.txt" 45 | } 46 | val init = env.readTextFile(initFilePath) 47 | val descriptor = new MapStateDescriptor[String, String]("dynamicConfig", BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO) 48 | val configStream = env.addSource(configSource).union(init).broadcast(descriptor) 49 | 50 | 51 | val input = env.addSource(new RadomFunction) 52 | .connect(configStream) 53 | .process(new BroadcastProcessFunction[String, String, String] { 54 | override def processBroadcastElement(value: String, ctx: BroadcastProcessFunction[String, String, String]#Context, out: Collector[String]): Unit = { 55 | 56 | println("new config : " + value) 57 | val configMap = ctx.getBroadcastState(descriptor) 58 | // process update configMap,读取配置数据,写入广播状态中 59 | val line = value.split(",") 60 | configMap.put(line(0), line(1)) 61 | } 62 | 63 | override def processElement(value: String, ctx: BroadcastProcessFunction[String, String, String]#ReadOnlyContext, out: Collector[String]): Unit = { 64 | // use give key, return value 65 | val configMap = ctx.getBroadcastState(descriptor) 66 | // 解析三位城市编码,根据广播状态对应的map,转码为城市对应中文 67 | // println(value) 68 | val line = value.split(",") 69 | val code = line(0) 70 | var va = configMap.get(code) 71 | // 不能转码的数据默认输出 中国(code=xxx) 72 | if (va == null) { 73 | va = "中国(code=" + code + ")"; 74 | } else { 75 | va = va + "(code=" + code + ")" 76 | } 77 | out.collect(va + "," + line(1)) 78 | } 79 | }) 80 | input.print() 81 | 82 | env.execute("BroadCastDemo") 83 | } 84 | } 85 | 86 | class RadomFunction extends SourceFunction[String] { 87 | var flag = true 88 | 89 | override def cancel(): Unit = { 90 | flag = false 91 | } 92 | 93 | override def run(ctx: SourceFunction.SourceContext[String]): Unit = { 94 | while 
(flag) { 95 | for (i <- 0 to 300) { 96 | var nu = i.toString 97 | while (nu.length < 3) { 98 | nu = "0" + nu 99 | } 100 | ctx.collect(nu + "," + StringUtil.getRandomString(5)) 101 | Thread.sleep(2000) 102 | } 103 | } 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/question/cdcStarrocks/CdcStarProcessFunction.java: -------------------------------------------------------------------------------- 1 | package com.venn.question.cdcStarrocks; 2 | 3 | import org.apache.flink.api.common.state.ListState; 4 | import org.apache.flink.api.common.state.ListStateDescriptor; 5 | import org.apache.flink.api.common.state.ValueState; 6 | import org.apache.flink.api.common.state.ValueStateDescriptor; 7 | import org.apache.flink.api.common.typeinfo.TypeInformation; 8 | import org.apache.flink.configuration.Configuration; 9 | import org.apache.flink.streaming.api.functions.KeyedProcessFunction; 10 | import org.apache.flink.util.Collector; 11 | import org.slf4j.Logger; 12 | import org.slf4j.LoggerFactory; 13 | 14 | import java.util.ArrayList; 15 | import java.util.Iterator; 16 | import java.util.List; 17 | 18 | public class CdcStarProcessFunction extends KeyedProcessFunction> { 19 | 20 | private final static Logger LOG = LoggerFactory.getLogger(CdcStarProcessFunction.class); 21 | private int batchSize; 22 | private long batchInterval; 23 | // next timer time 24 | private ValueState cacheTimer; 25 | // current cache size 26 | private ValueState cacheSize; 27 | // cache data 28 | private ListState cache; 29 | 30 | public CdcStarProcessFunction(int batchSize, long batchInterval) { 31 | this.batchSize = batchSize; 32 | this.batchInterval = batchInterval; 33 | } 34 | 35 | @Override 36 | public void open(Configuration parameters) throws Exception { 37 | 38 | ListStateDescriptor cacheDescriptor = new ListStateDescriptor("cache", TypeInformation.of(CdcRecord.class)); 39 | cache = getRuntimeContext().getListState(cacheDescriptor); 40 | 41 | ValueStateDescriptor cacheSizeDescriptor = new ValueStateDescriptor("cacheSize", Integer.class); 42 | cacheSize = getRuntimeContext().getState(cacheSizeDescriptor); 43 | 44 | ValueStateDescriptor cacheTimerDescriptor = new ValueStateDescriptor("cacheTimer", Long.class); 45 | cacheTimer = getRuntimeContext().getState(cacheTimerDescriptor); 46 | } 47 | 48 | @Override 49 | public void processElement(CdcRecord element, KeyedProcessFunction>.Context ctx, Collector> out) throws Exception { 50 | 51 | // cache size + 1 52 | if (cacheSize.value() != null) { 53 | cacheSize.update(cacheSize.value() + 1); 54 | } else { 55 | cacheSize.update(1); 56 | // add timer for interval flush 57 | long nextTimer = System.currentTimeMillis() + batchInterval; 58 | LOG.debug("register timer : {} , key : {}", nextTimer, ctx.getCurrentKey()); 59 | cacheTimer.update(nextTimer); 60 | ctx.timerService().registerProcessingTimeTimer(nextTimer); 61 | } 62 | // add data to cache state 63 | cache.add(element); 64 | // cache size max than batch Size 65 | if (cacheSize.value() >= batchSize) { 66 | // remove next timer 67 | long nextTimer = cacheTimer.value(); 68 | LOG.debug("{} remove timer, key : {}", nextTimer, ctx.getCurrentKey()); 69 | ctx.timerService().deleteProcessingTimeTimer(nextTimer); 70 | // flush data to down stream 71 | flushData(out); 72 | } 73 | } 74 | 75 | /** 76 | * flush data to down stream 77 | */ 78 | private void flushData(Collector> out) throws Exception { 79 | List tmpCache = new ArrayList<>(); 80 | Iterator it = 
cache.get().iterator(); 81 | while (it.hasNext()) { 82 | tmpCache.add(it.next()); 83 | } 84 | if (tmpCache.size() > 0) { 85 | out.collect(tmpCache); 86 | 87 | // finish flush all cache data, clear state 88 | cache.clear(); 89 | cacheSize.clear(); 90 | cacheTimer.clear(); 91 | } 92 | } 93 | 94 | @Override 95 | public void onTimer(long timestamp, KeyedProcessFunction>.OnTimerContext ctx, Collector> out) throws Exception { 96 | LOG.info("{} trigger timer to flush data", ctx.getCurrentKey(), timestamp); 97 | // batch interval trigger flush data 98 | flushData(out); 99 | } 100 | 101 | @Override 102 | public void close() throws Exception { 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/question/late1mtps/LateTps.scala: -------------------------------------------------------------------------------- 1 | package com.venn.question.late1mtps 2 | 3 | import com.google.gson.JsonParser 4 | import com.venn.entity.KafkaSimpleStringRecord 5 | import com.venn.source.TumblingEventTimeWindows 6 | import com.venn.util.{DateTimeUtil, SimpleKafkaRecordDeserializationSchema} 7 | import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy} 8 | import org.apache.flink.api.common.functions.RichMapFunction 9 | import org.apache.flink.api.common.serialization.SimpleStringSchema 10 | import org.apache.flink.api.scala._ 11 | import org.apache.flink.configuration.Configuration 12 | import org.apache.flink.connector.kafka.sink.{KafkaRecordSerializationSchema, KafkaSink} 13 | import org.apache.flink.connector.kafka.source.KafkaSource 14 | import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer 15 | import org.apache.flink.streaming.api.scala.{OutputTag, StreamExecutionEnvironment} 16 | import org.apache.flink.streaming.api.windowing.time.Time 17 | 18 | import java.time.Duration 19 | 20 | object LateTps { 21 | 22 | def main(args: Array[String]): Unit = { 23 | 24 | val env = StreamExecutionEnvironment.getExecutionEnvironment 25 | env.setParallelism(1) 26 | 27 | val topic = "user_log" 28 | val bootstrapServer = "localhost:9092" 29 | // window size second 30 | val windowSize: Int = 10 * 60 31 | // calculate tps interval 32 | val intervalSize: Int = 10 33 | 34 | // kafka source for read data 35 | val kafkaSource = KafkaSource 36 | .builder[KafkaSimpleStringRecord]() 37 | .setTopics(topic) 38 | .setBootstrapServers(bootstrapServer) 39 | .setGroupId("late_tps") 40 | .setStartingOffsets(OffsetsInitializer.latest()) 41 | .setDeserializer(new SimpleKafkaRecordDeserializationSchema()) 42 | .build() 43 | 44 | // add source 45 | val source = env 46 | .fromSource(kafkaSource, WatermarkStrategy.forBoundedOutOfOrderness(Duration.ofSeconds(5)), "kafkaSource") 47 | 48 | // parse data, only get (user_id, ts) 49 | val stream = source 50 | .map(new RichMapFunction[KafkaSimpleStringRecord, (String, Long)] { 51 | var jsonParse: JsonParser = _ 52 | override def open(parameters: Configuration): Unit = { 53 | jsonParse = new JsonParser 54 | } 55 | override def map(element: KafkaSimpleStringRecord): (String, Long) = { 56 | 57 | val json = jsonParse.parse(element.getValue).getAsJsonObject 58 | val tsStr = json.get("ts").getAsString 59 | val ts = DateTimeUtil.parse(tsStr).getTime 60 | val userId = json.get("user_id").getAsString 61 | 62 | (userId, ts) 63 | } 64 | override def close(): Unit = { 65 | jsonParse = null 66 | 67 | } 68 | }) 69 | // set timestamp and watermark 70 | 
.assignTimestampsAndWatermarks(WatermarkStrategy 71 | .forBoundedOutOfOrderness[(String, Long)](Duration.ofSeconds(5)) 72 | .withTimestampAssigner(new SerializableTimestampAssigner[(String, Long)] { 73 | override def extractTimestamp(t: (String, Long), l: Long): Long = { 74 | t._2 75 | } 76 | }) 77 | // idle 1 minute 78 | .withIdleness(Duration.ofMinutes(1)) 79 | ) 80 | 81 | 82 | // windowSize 10 minute, export every 1 minute tps 83 | val process10m = stream 84 | .windowAll(TumblingEventTimeWindows.of(Time.seconds(windowSize))) 85 | .process(new FixedLateTpsProcessAllWindowFunction(windowSize, 60)) 86 | .print("10m") 87 | 88 | // // windowSize minute, export every 1 minute tps 89 | val process10s = stream 90 | .windowAll(TumblingEventTimeWindows.of(Time.seconds(windowSize))) 91 | .process(new AdjustLateTpsProcessAllWindowFunction(windowSize , intervalSize)) 92 | 93 | process10s.print("10s") 94 | 95 | val tag = new OutputTag[String]("size") 96 | val side = process10s.getSideOutput(tag) 97 | 98 | // side tmp result to kafka 99 | val kafkaSink = KafkaSink.builder[String]() 100 | .setBootstrapServers(bootstrapServer) 101 | .setRecordSerializer(KafkaRecordSerializationSchema.builder[String]() 102 | .setTopic(topic +"_side_sink") 103 | .setValueSerializationSchema(new SimpleStringSchema()) 104 | .build() 105 | ) 106 | .build() 107 | 108 | // add sink 109 | side.sinkTo(kafkaSink) 110 | 111 | // execute task 112 | env.execute("LateTps") 113 | } 114 | 115 | } 116 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/common/MySqlDateTimeConverter.java: -------------------------------------------------------------------------------- 1 | package com.venn.common; 2 | 3 | import io.debezium.spi.converter.CustomConverter; 4 | import io.debezium.spi.converter.RelationalColumn; 5 | import org.apache.kafka.connect.data.SchemaBuilder; 6 | 7 | import java.time.*; 8 | import java.time.format.DateTimeFormatter; 9 | import java.util.Properties; 10 | /** 11 | * @Classname MySqlDateTimeConverter 12 | * @Description TODO 13 | * @Date 2024/3/7 14 | * @Created by venn 15 | */ 16 | public class MySqlDateTimeConverter implements CustomConverter{ 17 | 18 | 19 | private DateTimeFormatter dateFormatter = DateTimeFormatter.ISO_DATE; 20 | 21 | private DateTimeFormatter timeFormatter = DateTimeFormatter.ISO_TIME; 22 | 23 | private DateTimeFormatter datetimeFormatter = DateTimeFormatter.ISO_DATE_TIME; 24 | 25 | private DateTimeFormatter timestampFormatter = DateTimeFormatter.ISO_DATE_TIME; 26 | 27 | private ZoneId timestampZoneId = ZoneId.systemDefault(); 28 | 29 | @Override 30 | public void configure(Properties props) { 31 | 32 | } 33 | 34 | @Override 35 | public void converterFor(RelationalColumn column, ConverterRegistration registration) { 36 | 37 | String sqlType = column.typeName().toUpperCase(); 38 | 39 | SchemaBuilder schemaBuilder = null; 40 | 41 | Converter converter = null; 42 | 43 | if ("DATE".equals(sqlType)) { 44 | 45 | schemaBuilder = SchemaBuilder.string().optional().name("com.darcytech.debezium.date.string"); 46 | 47 | converter = this::convertDate; 48 | 49 | } 50 | 51 | if ("TIME".equals(sqlType)) { 52 | 53 | schemaBuilder = SchemaBuilder.string().optional().name("com.darcytech.debezium.time.string"); 54 | 55 | converter = this::convertTime; 56 | 57 | } 58 | 59 | if ("DATETIME".equals(sqlType)) { 60 | 61 | schemaBuilder = SchemaBuilder.string().optional().name("com.darcytech.debezium.datetime.string"); 62 | 63 | converter = this::convertDateTime; 64 | 65 
| 66 | } 67 | 68 | if ("TIMESTAMP".equals(sqlType)) { 69 | 70 | schemaBuilder = SchemaBuilder.string().optional().name("com.darcytech.debezium.timestamp.string"); 71 | 72 | converter = this::convertTimestamp; 73 | 74 | } 75 | 76 | if (schemaBuilder != null) { 77 | 78 | registration.register(schemaBuilder, converter); 79 | 80 | } 81 | 82 | } 83 | 84 | 85 | private String convertDate(Object input) { 86 | 87 | if (input == null) return null; 88 | 89 | if (input instanceof LocalDate) { 90 | 91 | return dateFormatter.format((LocalDate) input); 92 | 93 | } 94 | 95 | if (input instanceof Integer) { 96 | 97 | LocalDate date = LocalDate.ofEpochDay((Integer) input); 98 | 99 | return dateFormatter.format(date); 100 | 101 | } 102 | 103 | return String.valueOf(input); 104 | 105 | } 106 | 107 | 108 | private String convertTime(Object input) { 109 | 110 | if (input == null) return null; 111 | 112 | if (input instanceof Duration) { 113 | 114 | Duration duration = (Duration) input; 115 | 116 | long seconds = duration.getSeconds(); 117 | 118 | int nano = duration.getNano(); 119 | 120 | LocalTime time = LocalTime.ofSecondOfDay(seconds).withNano(nano); 121 | 122 | return timeFormatter.format(time); 123 | 124 | } 125 | 126 | return String.valueOf(input); 127 | 128 | } 129 | 130 | 131 | private String convertDateTime(Object input) { 132 | 133 | if (input == null) return null; 134 | 135 | if (input instanceof LocalDateTime) { 136 | 137 | return datetimeFormatter.format((LocalDateTime) input).replaceAll("T", " "); 138 | 139 | } 140 | 141 | return String.valueOf(input); 142 | 143 | } 144 | 145 | 146 | private String convertTimestamp(Object input) { 147 | 148 | if (input == null) return null; 149 | 150 | if (input instanceof ZonedDateTime) { 151 | 152 | // mysql的timestamp会转成UTC存储,这里的zonedDatetime都是UTC时间 153 | 154 | ZonedDateTime zonedDateTime = (ZonedDateTime) input; 155 | 156 | LocalDateTime localDateTime = zonedDateTime.withZoneSameInstant(timestampZoneId).toLocalDateTime(); 157 | 158 | return timestampFormatter.format(localDateTime).replaceAll("T", " "); 159 | 160 | } 161 | return String.valueOf(input); 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /src/main/scala/com/venn/stream/api/dayWindow/CurrentDayPvCount.scala: -------------------------------------------------------------------------------- 1 | package com.venn.stream.api.dayWindow 2 | 3 | import java.io.File 4 | import java.text.SimpleDateFormat 5 | 6 | import com.venn.common.Common 7 | import com.venn.source.TumblingEventTimeWindows 8 | import com.venn.util.CheckpointUtil 9 | import org.apache.flink.api.common.functions.ReduceFunction 10 | import org.apache.flink.api.common.serialization.SimpleStringSchema 11 | import org.apache.flink.api.scala._ 12 | import org.apache.flink.contrib.streaming.state.RocksDBStateBackend 13 | import org.apache.flink.formats.json.JsonNodeDeserializationSchema 14 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.ObjectNode 15 | import org.apache.flink.streaming.api.TimeCharacteristic 16 | import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor 17 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 18 | import org.apache.flink.streaming.api.windowing.time.Time 19 | import org.apache.flink.streaming.api.windowing.triggers.{ContinuousEventTimeTrigger, ContinuousProcessingTimeTrigger} 20 | import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaConsumer, FlinkKafkaProducer} 21 
| 22 | /** 23 | * Created by venn on 19-5-23. 24 | * 25 | * use TumblingEventTimeWindows count current day pv 26 | * for test, update day window to minute window 27 | * 28 | * .windowAll(TumblingEventTimeWindows.of(Time.minutes(1), Time.seconds(0))) 29 | * TumblingEventTimeWindows can ensure count o minute event, 30 | * and time start at 0 second (like : 00:00:00 to 00:00:59) 31 | * 32 | */ 33 | object CurrentDayPvCount { 34 | 35 | def main(args: Array[String]): Unit = { 36 | // environment 37 | val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment 38 | // env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) 39 | env.setParallelism(1) 40 | // if ("\\".equals(File.pathSeparator)) { 41 | // val rock = new RocksDBStateBackend(Common.CHECK_POINT_DATA_DIR) 42 | // env.setStateBackend(rock) 43 | // // checkpoint interval 44 | // env.enableCheckpointing(10000) 45 | // } 46 | CheckpointUtil.setCheckpoint(env, "rocksdb", Common.CHECK_POINT_DATA_DIR, 10) 47 | 48 | val topic = "current_day" 49 | val sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS") 50 | val kafkaSource = new FlinkKafkaConsumer[ObjectNode](topic, new JsonNodeDeserializationSchema(), Common.getProp) 51 | val sink = new FlinkKafkaProducer[String](topic + "_out", new SimpleStringSchema(), Common.getProp) 52 | sink.setWriteTimestampToKafka(true) 53 | 54 | val stream = env.addSource(kafkaSource) 55 | .map(node => { 56 | Eventx(node.get("id").asText(), node.get("createTime").asText()) 57 | }) 58 | .assignAscendingTimestamps(event => sdf.parse(event.createTime).getTime) 59 | .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[Eventx](Time.seconds(60)) { 60 | override def extractTimestamp(element: Eventx): Long = { 61 | sdf.parse(element.createTime).getTime 62 | } 63 | }) 64 | // window is one minute, start at 0 second 65 | //.windowAll(TumblingEventTimeWindows.of(Time.minutes(1), Time.seconds(0))) 66 | // window is one hour, start at 0 second 67 | // .windowAll(TumblingEventTimeWindows.of(Time.hours(1), Time.seconds(0))) 68 | // window is one day, start at 0 second, todo there have a bug(FLINK-11326), can't use negative number, 1.8 修复 69 | // .windowAll(TumblingEventTimeWindows.of(Time.days(1))) 70 | .windowAll(TumblingEventTimeWindows.of(Time.days(1), Time.hours(-8))) 71 | // every event one minute 如果使用了trigger,窗口函数每次执行,窗口中的所有元素都会参与计算 72 | // .trigger(ContinuousEventTimeTrigger.of(Time.seconds(3800))) 73 | // every process one minute 74 | .trigger(ContinuousProcessingTimeTrigger.of(Time.seconds(10))) 75 | // every event, export current value, 76 | // .trigger(CountTrigger.of(1)) 77 | .reduce(new ReduceFunction[Eventx] { 78 | 79 | 80 | override def reduce(event1: Eventx, event2: Eventx): Eventx = { 81 | print(event2.toString) 82 | 83 | // 将结果中,id的最小值和最大值输出 84 | new Eventx(event1.id, event2.id, event1.amt + event2.amt) 85 | } 86 | }) 87 | // format output even, connect min max id, add current timestamp 88 | // .map(event => Event(event.id + "-" + event.createTime, sdf.format(System.currentTimeMillis()), event.count)) 89 | stream.print("result : ") 90 | 91 | // execute job 92 | env.execute("CurrentDayCount") 93 | } 94 | 95 | } 96 | 97 | case class Event(id: String, createTime: String, count: Int = 1) {} 98 | 99 | --------------------------------------------------------------------------------