├── README.md ├── pom.xml └── src ├── main ├── java │ └── com │ │ └── singgel │ │ └── bigdata │ │ └── flinksinkhbase │ │ ├── FlinkRunner.java │ │ ├── HbaseSink.java │ │ ├── common │ │ ├── CustomJsonDeserializationSchema.java │ │ ├── HbaseUtil.java │ │ ├── JobConfigManager.java │ │ ├── JoinTable.java │ │ └── ValueFormat.java │ │ └── config │ │ ├── HbaseConfig.java │ │ ├── JobConfig.java │ │ └── KafkaConfig.java └── resources │ ├── application.conf │ └── logback.xml └── test ├── java └── com │ └── singgel │ └── bigdata │ └── recommend │ ├── HbaseByteTest.java │ ├── HbaseGetTest.java │ └── JobConfigTest.java └── resources └── logback.xml /README.md: -------------------------------------------------------------------------------- 1 | # flink-kafka-hbase 2 | 功能:实现kafka消息实时落地hbase,支持csv/json字符串两种格式的消息,支持自定义组合rowkey,列簇和列名,支持按照kafka消息流中不同字段join不同的hbase表,并自定义写入列簇和列(join时需评估一下性能) 3 | 支持at least once语义 4 | 外部依赖:apollo配置中心,本项目依靠配置驱动,配置存储在apollo配置中心 5 | 配置: 6 | ``` 7 | { 8 | "indexColumnMapping": { --indexColumnMapping即CSV格式消息的key和value按照value里的分隔符拼接后再分割后下标及写入hbase列的对应关系 9 | "0": "basic:time", --第0列始终是kafka消息的key,如果不需要可以不指定 10 | "1": "basic:user_id", 11 | "2": "basic:session_id", 12 | "3": "basic:test_mission_id", 13 | "4": "basic:stratege_name", 14 | "5": "basic:status_type", 15 | "6": "basic:status_id", 16 | "7": "basic:position", 17 | "8": "basic:like_count", 18 | "9": "basic:retweet_count", 19 | "10": "basic:reply_count", 20 | "11": "basic:fav_count", 21 | "12": "basic:reward_amount", 22 | "13": "basic:reward_user_count", 23 | "14": "basic:status_hot_score", 24 | "15": "basic:status_hot_score_norm", 25 | "16": "basic:user_score", 26 | "17": "basic:use_score_norm", 27 | "18": "basic:stock_score", 28 | "19": "basic:stock_score_norm", 29 | "20": "basic:tag", 30 | "21": "basic:tag_score", 31 | "22": "basic:stock_symbol", 32 | "23": "basic:ip", 33 | "24": "basic:device", 34 | "25": "basic:country_name", 35 | "26": "basic:city_name", 36 | "27": "basic:topic_score", 37 | "28": "basic:rerank_name", 38 | "29": "basic:author_block_count", 39 | "30": "basic:percent", 40 | "31": "basic:random_id", 41 | "32": "basic:rank_score", 42 | "33": "basic:quote_string", 43 | "34": "basic:click_num", 44 | "35": "basic:show_num", 45 | "36": "basic:tag_short_term_click", 46 | "37": "basic:tag_short_term_show", 47 | "38": "basic:tag_long_term_click", 48 | "39": "basic:tag_long_term_show", 49 | "40": "basic:block_count", 50 | "41": "basic:context_info", 51 | "42": "basic:recent_behavior", 52 | "43": "basic:basic_string", 53 | "44": "basic:mention_stock_rank", 54 | "45": "basic:text_quality_score", 55 | "46": "basic:last_nc_context", 56 | "47": "basic:keywords" 57 | }, 58 | "rowKeyDelimiter": "#", --如果rowkey是多个列的拼接,则需指定的拼接符 59 | "rowKeyColumns": [ --rowkey组成的列 60 | "basic:user_id", 61 | "basic:statusId" 62 | ], 63 | "tableName": "cy_test", --数据流要写入hbase表的表名(如不存在会自动创建) 64 | "kafkaConfig": { --flink接入kafka数据源的配置 65 | "bootstrapServers": "singgel:9092", --kafka的broker list 66 | "topic": "recommend2.statistics", --需要接入的topic 67 | "groupId": "flink_recommend2_statistic_join_test2", --flink中消费kafka topic的groupId 68 | "delimiter": "|", --kafka消息value的分隔符,当valueFormat=CSV时必须指定 69 | "valueFormat": "CSV", --kafka消息value的格式,目前支持"CSV"和"JSON"两种 70 | "optionalProps": {} --其他kafka消费者配置 71 | }, 72 | "hbaseConfig": { --写入的hbase集群配置 73 | "zookerperQuorum": 
"singgel-53-3.inter.singgel.com,singgel-53-4.inter.singgel.com,singgel-53-5.inter.singgel.com,singgel-53-6.inter.singgel.com,singgel-54-3.inter.singgel.com,singgel-54-4.inter.singgel.com,singgel-54-5.inter.singgel.com,singgel-54-6.inter.singgel.com", 74 | "port": "2181", 75 | "zookeeperZondeParent": "/hbase-unsecure", --hbase在zookeeper中的根目录节点名称(注意:咱内部cdh集群是/hbase,此处是ambari集群hbase的配置) 76 | "batchCount": 100, --批量写入的条数,与interval条件满足其一就触发写入,注:当接入的topic数据源生产速率较小时且无join时,可以设置为1,逐条写入 77 | "interval": 5000, --批量写入的间隔时间 78 | "optionalProp": {} --其他hbase设置 79 | }, 80 | "jobName": "recommend_feature_hbase_sink_test", --flink job的名称 81 | "parallelism": 8, --flink任务执行时的平行度 82 | "jarName": "flink-kafka-hbase-1.0-SNAPSHOT-jar-with-dependencies.jar", --执行flink任务的jar包,当通过实时平台界面提交flinkjob时需要指定 83 | "joinTables": [ --需要join的表,可以指定多个,可以为空;当要join多个表时,需要评估一下性能 84 | { 85 | "tableName": "user_feature", --需要join的hbase表 86 | "joinKey": "basic:userId", --join的字段,需要在"indexColumnMapping的values中,且是joinTable的rowKey 87 | "columnsMapping": { --join表中的列和要写入表中的列的对应关系,key->fromFamily:fromColumn,value->toFamily:toColumn,from和to的列簇和列不需一致 88 | "basic:pagerank": "basic :pagerank", 89 | "basic:country": "basic:country", 90 | "basic:province": "basic:province", 91 | "basic:city": "basic:city", 92 | "basic:mobile": "basic:mobile", 93 | "basic:follower_cluster": "basic:follower_cluster", 94 | "basic:quality_cluster": "basic:quality_cluster", 95 | "basic:symbol_cluster": "basic:symbol_cluster", 96 | "basic:topic_cluster": "basic:topic_cluster", 97 | "basic:stock_click7": "basic:stock_click7", 98 | "basic:stock_show7": "basic:stock_show7", 99 | "basic:stock_click30": "basic:stock_click30", 100 | "basic:stock_show30": "basic:stock_show30", 101 | "basic:symbol_page_enter": "basic:symbol_page_enter", 102 | "basic:symbol_new_status": "basic:symbol_new_status", 103 | "basic:symbol_hot": "basic:symbol_hot", 104 | "basic:symbol_finance": "basic:symbol_finance", 105 | "basic:symbol_news": "basic:symbol_news", 106 | "basic:symbol_notice": "basic:symbol_notice", 107 | "basic:symbol_general": "basic:symbol_general", 108 | "basic:symbol_page_view": "basic:symbol_page_view", 109 | "basic:symbol_page_origin": "basic:symbol_page_origin", 110 | "basic:attention_mark": "basic:attention_mark", 111 | "basic:rebalance_num": "basic:rebalance_num", 112 | "basic:topic_personal_short_click": "basic:topic_personal_short_click", 113 | "basic:topic_personal_short_show": "basic:topic_personal_short_show", 114 | "basic:topic_personal_long_click": "basic:topic_personal_long_click", 115 | "basic:topic_personal_long_show": "basic:topic_personal_long_show", 116 | "basic:dislike_1st": "basic:dislike_1st", 117 | "basic:dislike_2st": "basic:dislike_2st", 118 | "basic:dislike_3st": "basic:dislike_3st", 119 | "basic:dislike_4st": "basic:dislike_4st", 120 | "basic:dislike_5st": "basic:dislike_5st", 121 | "basic:familar_1st": "basic:familar_1st", 122 | "basic:familar_2st": "basic:familar_2st", 123 | "basic:familar_3st": "basic:familar_3st", 124 | "basic:familar_4st": "basic:familar_4st", 125 | "basic:familar_5st": "basic:familar_5st", 126 | "basic:like_1st": "basic:like_1st", 127 | "basic:like_2st": "basic:like_2st", 128 | "basic:like_3st": "basic:like_3st", 129 | "basic:like_4st": "basic:like_4st", 130 | "basic:like_5st": "basic:like_5st", 131 | "basic:unfamilar_1st": "basic:unfamilar_1st", 132 | "basic:unfamilar_2st": "basic:unfamilar_2st", 133 | "basic:unfamilar_3st": "basic:unfamilar_3st", 134 | "basic:unfamilar_4st": "basic:unfamilar_4st", 135 | "basic:unfamilar_5st": 
"basic:unfamilar_5st", 136 | "basic:headline_down_cnt": "basic:headline_down_cnt", 137 | "basic:headline_up_cnt": "basic:headline_up_cnt", 138 | "basic:optional_cnt": "basic:optional_cnt", 139 | "basic:dynamic_cnt": "basic:dynamic_cnt", 140 | "basic:quotation_cnt": "basic:quotation_cnt", 141 | "basic:base_rate": "basic:base_rate", 142 | "basic:mark_gegu_enter": "basic:mark_gegu_enter", 143 | "basic:mark_share_sum": "basic:mark_share_sum", 144 | "basic:mark_head_dislike_sum": "basic:mark_head_dislike_sum", 145 | "basic:mark_status_post_user_sum": "basic:mark_status_post_user_sum", 146 | "basic:mark_search_sum": "basic:mark_search_sum", 147 | "basic:mark_debate_post_user_num": "basic:mark_debate_post_user_num", 148 | "basic:author_click_week": "basic:author_click_week", 149 | "basic:author_show_week": "basic:author_show_week", 150 | "basic:author_click_month": "basic:author_click_month", 151 | "basic:author_show_month": "basic:author_show_month" 152 | } 153 | }, 154 | { 155 | "tableName": "status_feature_string", 156 | "joinKey": "basic:statusId", 157 | "columnsMapping": { 158 | "basic:user_id": "basic:user_id", 159 | "basic:symbol_id": "basic:symbol_id", 160 | "basic:created_at": "basic:created_at", 161 | "basic:source": "basic:source", 162 | "basic:retweet_status_id": "basic:retweet_status_id", 163 | "basic:paid_mention_id": "basic:paid_mention_id", 164 | "basic:retweet_user_id": "basic:retweet_user_id", 165 | "basic:retweet_symbol_id": "basic:retweet_symbol_id", 166 | "basic:truncate": "basic:truncate", 167 | "basic:flags": "basic:flags", 168 | "basic:expired_at": "basic:expired_at", 169 | "basic:title_length": "basic:title_length", 170 | "basic:title_hash": "basic:title_hash", 171 | "basic:title_flag": "basic:title_flag", 172 | "basic:text_length": "basic:text_length", 173 | "basic:pic_count": "basic:pic_count", 174 | "basic:type": "basic:type", 175 | "basic:meta_classes": "basic:meta_classes", 176 | "basic:pic_score": "basic:pic_score", 177 | "basic:domain": "basic:domain", 178 | "basic:url_hash": "basic:url_hash", 179 | "basic:character_percent": "basic:character_percent", 180 | "basic:symbol": "basic:symbol", 181 | "basic:keyword": "basic:keyword", 182 | "basic:match_word": "basic:match_word", 183 | "basic:keyword_title": "basic:keyword_title", 184 | "basic:keyword_des": "basic:keyword_des", 185 | "basic:symbol_title": "basic:symbol_title", 186 | "basic:symbol_sim_title": "basic:symbol_sim_title", 187 | "basic:symbol_des": "basic:symbol_des", 188 | "basic:symbol_sim_des": "basic:symbol_sim_des", 189 | "basic:symbol_content": "basic:symbol_content", 190 | "basic:symbol_sim_content": "basic:symbol_sim_content" 191 | } 192 | } 193 | ] 194 | } 195 | ``` 196 | 197 | 在flink任务启动时,会去apollo配置中心取指定的配置,根据配置执行任务。 198 | 199 | 关键实现HbaseSink代码如下: 200 | ``` 201 | @Slf4j 202 | public class HbaseSink extends RichSinkFunction implements CheckpointedFunction { 203 | 204 | private final JobConfig jobConfig; 205 | 206 | private HbaseUtil hbaseUtil; 207 | 208 | private long currentTime = System.currentTimeMillis(); 209 | 210 | /** 211 | * 在flink任务自动重试时,会先恢复state中的数据;如果是cancel掉flink任务,重新手动提交,则state会清空 212 | */ 213 | private transient ListState checkpointedState; 214 | 215 | private List nodes = new ArrayList<>(); 216 | 217 | private static ObjectMapper MAPPER = new ObjectMapper(); 218 | 219 | private StringBuilder sbLog = new StringBuilder(); 220 | 221 | public HbaseSink(JobConfig jobConfig) { 222 | this.jobConfig = jobConfig; 223 | } 224 | 225 | 226 | @Override 227 | public void open(Configuration 
parameters) throws Exception { 228 | super.open(parameters); 229 | this.hbaseUtil = new HbaseUtil(jobConfig.getHbaseConfig()); 230 | } 231 | 232 | /** 233 | * 在手动cancel和程序内部出错重试时都会触发close方法,在close方法中将nodes中的数据先flush,防止在两次写入之间,checkpoint点之前的数据丢失 234 | * 但会有数据重复,checkpoint点之后到发生故障时的数据会重复,如下示意图: 235 | *
236 |      *                       flush
237 |      *                        ^
238 |      *                       /
239 |      *            +--------------------------+
240 |      * ------write------checkpoint-----------down----
241 |      *                           +-----------+
242 |      *                                 ^
243 |      *                                /
244 |      *                           will repeat
245 |      * 
246 | *
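     * The records repeat because Kafka offsets are committed only at checkpoints: close() flushes whatever
     * is still buffered in nodes, but after a restart the job rewinds to the last completed checkpoint and
     * re-consumes everything after it, so those records are written to HBase again (at-least-once).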
247 | * 但对于写入具有幂等性的业务,数据重复写入不会影响结果 248 | * 249 | * @throws Exception 250 | */ 251 | @Override 252 | public void close() throws Exception { 253 | log.debug("execute sink close method"); 254 | super.close(); 255 | batchFlush(); 256 | if (this.hbaseUtil.getConnection() != null) { 257 | try { 258 | this.hbaseUtil.getConnection().close(); 259 | } catch (Exception e) { 260 | log.warn("connection close failed. error:{} ", e.getMessage()); 261 | } 262 | } 263 | } 264 | 265 | /** 266 | * 每条记录调用一次此方法 267 | * 268 | * @param node 269 | * @param context 270 | * @throws Exception 271 | */ 272 | @Override 273 | public void invoke(ObjectNode node, Context context) throws Exception { 274 | String partition = node.get("metadata").get("partition").asText(); 275 | String offset = node.get("metadata").get("offset").asText(); 276 | String value = node.get("value").asText(); 277 | log.debug("partition->{}|offset->{}|value->{}", partition, offset, value); 278 | nodes.add(node); 279 | if (nodes.size() >= jobConfig.getHbaseConfig().getBatchCount() || 280 | (System.currentTimeMillis() - currentTime > jobConfig.getHbaseConfig().getInterval() && nodes.size() > 0)) { 281 | batchFlush(); 282 | } 283 | } 284 | 285 | /** 286 | * 将{@link HbaseSink#nodes}中的数据批量写入 287 | * 288 | * @throws IOException 289 | */ 290 | private void batchFlush() throws IOException { 291 | long start = System.currentTimeMillis(); 292 | List puts = convertBatch(nodes); 293 | if (puts.size() == 0) { 294 | return; 295 | } 296 | long startPut = System.currentTimeMillis(); 297 | hbaseUtil.putBatchData(jobConfig.getTableName(), puts); 298 | long end = System.currentTimeMillis(); 299 | sbLog.append(String.format(" | batch_put(%d) cost %d ms", puts.size(), end - startPut)); 300 | sbLog.append(String.format(" | batch_total(%d) cost %d ms", puts.size(), end - start)); 301 | sbLog.append(String.format(" | per record cost %d ms", (end - start) / puts.size())); 302 | log.debug(sbLog.toString()); 303 | currentTime = System.currentTimeMillis(); 304 | sbLog = new StringBuilder(); 305 | nodes.clear(); 306 | } 307 | 308 | /** 309 | * 批量处理 310 | * 311 | * @param objectNodes 一批数据 312 | * @return 返回批量的Put 313 | */ 314 | private List convertBatch(List objectNodes) throws IOException { 315 | Map> puts = new HashMap<>(objectNodes.size()); 316 | //存储每个需要join的表中这个批次的rowkey的值 317 | Map> joinKeys = new HashMap<>(objectNodes.size()); 318 | 319 | for (ObjectNode node : objectNodes) { 320 | Map keyValues = getKeyValues(node); 321 | 322 | //获取拼接的rowKey 323 | List rowKeyValues = new ArrayList<>(); 324 | jobConfig.getRowKeyColumns().forEach(e -> rowKeyValues.add(keyValues.get(e))); 325 | if (rowKeyValues.stream().anyMatch(Objects::isNull)) { 326 | //如果组合rowKey的字段中有null,则过滤掉此记录 327 | log.warn("columns which consist of rowKey has null value"); 328 | continue; 329 | } 330 | String rowKey = String.join(jobConfig.getRowKeyDelimiter(), rowKeyValues); 331 | Put put = new Put(Bytes.toBytes(rowKey)); 332 | 333 | //获取这个joinTable表中这个批次所有需要join的key 334 | for (JoinTable joinTable : jobConfig.getJoinTables()) { 335 | 336 | joinKeys.compute(joinTable.getTableName(), (k, v) -> { 337 | //keyValues.get(joinTable.getJoinKey()的值有可能为null,需做空判断 338 | if (keyValues.get(joinTable.getJoinKey()) != null) { 339 | if (v == null) { 340 | v = new HashSet<>(); 341 | v.add(keyValues.get(joinTable.getJoinKey())); 342 | } else { 343 | v.add(keyValues.get(joinTable.getJoinKey())); 344 | } 345 | } 346 | return v; 347 | }); 348 | } 349 | 350 | 351 | //原始topic需要写入的列 352 | keyValues.forEach((k, v) -> { 353 | String 
family = k.split(":")[0]; 354 | String column = k.split(":")[1]; 355 | put.addColumn(Bytes.toBytes(family), Bytes.toBytes(column), v != null ? Bytes.toBytes(v) : null); 356 | }); 357 | 358 | puts.put(put, keyValues); 359 | } 360 | 361 | //当需要join时执行下面操作 362 | for (JoinTable joinTable : jobConfig.getJoinTables()) { 363 | //取出这个joinTable表中这个批次所有需要join的key 364 | Set keys = joinKeys.get(joinTable.getTableName()); 365 | List gets = new ArrayList<>(); 366 | //将key和result一一对应 367 | Map keyResults = new HashMap<>(keys.size()); 368 | 369 | //生成需要批量get的List 370 | keys.forEach(e -> { 371 | Get get = new Get(Bytes.toBytes(e)); 372 | joinTable.getColumnsMapping().forEach((k, v) -> { 373 | get.addColumn(Bytes.toBytes(k.split(":")[0]), Bytes.toBytes(k.split(":")[1])); 374 | }); 375 | gets.add(get); 376 | }); 377 | 378 | 379 | long start = System.currentTimeMillis(); 380 | //执行批量get 381 | Result[] results = hbaseUtil.batchGet(joinTable.getTableName(), gets); 382 | for (Result result : results) { 383 | if (result != null) { 384 | keyResults.put(Bytes.toString(result.getRow()), result); 385 | } 386 | } 387 | long end = System.currentTimeMillis(); 388 | 389 | sbLog.append(String.format("| batch_get %s(%d) %d ms", joinTable.getTableName(), keys.size(), (end - start))); 390 | //对之前原始写入的每个put,获取这个表需要join的rowKey的result,然后将result中的值根据joinTable的配置添加到put的对应列中 391 | puts.forEach((put, keyValues) -> { 392 | Result result = keyResults.get(keyValues.get(joinTable.getJoinKey())); 393 | if (result != null) { 394 | joinTable.getColumnsMapping().forEach((k, v) -> { 395 | byte[] columnValue = result.getValue(Bytes.toBytes(k.split(":")[0]), Bytes.toBytes(k.split(":")[1])); 396 | put.addColumn(Bytes.toBytes(v.split(":")[0]), Bytes.toBytes(v.split(":")[1]), columnValue); 397 | }); 398 | } 399 | }); 400 | } 401 | 402 | 403 | return new ArrayList<>(puts.keySet()); 404 | } 405 | 406 | /** 407 | * 根据配置中给定的列值对应关系,将每条消息解析成格式,key为配置中指定的列名(包含列簇) 408 | * 目前支持两种消息格式:CSV和JSON格式的字符串型数据, 409 | * 在处理消息时,kafka消息的key默认会被集成到value中, 对于CSV格式,kafka消息的key处在index=0的位置;对于JSON格式,kafka消息的key对应默认的kafka_key字段 410 | * 411 | * @param node flink接入的kafka消息 412 | * @return 返回字段名称对应的值 413 | */ 414 | private Map getKeyValues(ObjectNode node) { 415 | Map indexColumns = jobConfig.getIndexColumnMapping(); 416 | String key = node.get("key") == null ? "" : node.get("key").asText(); 417 | String value = node.get("value") == null ? 
"" : node.get("value").asText(); 418 | 419 | 420 | Map keyValues = new HashMap<>(8); 421 | 422 | ValueFormat valueFormat = jobConfig.getKafkaConfig().getValueFormat(); 423 | switch (valueFormat) { 424 | case CSV: 425 | //将key和value拼接起来,配置时kafka的key值作为下标的第0个 426 | String input = key + jobConfig.getKafkaConfig().getDelimiter() + value; 427 | String[] columnValues = StringUtils.splitPreserveAllTokens(input, jobConfig.getKafkaConfig().getDelimiter()); 428 | 429 | //将index对应的列值写入对应的列名下,列名包含了列簇名,形如:family:qualifier 430 | for (Map.Entry entry : indexColumns.entrySet()) { 431 | try { 432 | keyValues.put(entry.getValue(), columnValues[Integer.valueOf(entry.getKey())]); 433 | } catch (Exception e) { 434 | 435 | log.warn("index {} out of boundary.", entry.getKey(), e); 436 | } 437 | } 438 | 439 | break; 440 | case JSON: 441 | default: 442 | //将kafka的key加入node,统一处理 443 | try { 444 | 445 | ObjectNode jsonNode = (ObjectNode) MAPPER.readTree(value); 446 | 447 | jsonNode.put("kafka_key", key); 448 | 449 | //将配置中指定的列值写入对应的列名下,列名包含了列簇名,形如:family:qualifier 450 | indexColumns.forEach((k, v) -> { 451 | if (jsonNode.get(k) != null) { 452 | keyValues.put(v, jsonNode.get(k).asText()); 453 | } 454 | }); 455 | } catch (IOException e) { 456 | keyValues.clear(); 457 | indexColumns.forEach((k, v) -> keyValues.put(v, null)); 458 | String partition = node.get("metadata").get("partition").asText(); 459 | String offset = node.get("metadata").get("offset").asText(); 460 | String topic = node.get("metadata").get("topic").asText(); 461 | log.warn("this json record failed.topic->{},partition->{},offset->{},value->{}", topic, partition, offset, value); 462 | } 463 | break; 464 | } 465 | return keyValues; 466 | } 467 | 468 | /** 469 | * 执行频率和{@link FlinkRunner}中指定的checkpoint间隔一致 470 | * 471 | * @param context 472 | * @throws Exception 473 | */ 474 | @Override 475 | public void snapshotState(FunctionSnapshotContext context) throws Exception { 476 | checkpointedState.clear(); 477 | for (ObjectNode element : nodes) { 478 | checkpointedState.add(element); 479 | } 480 | log.debug("execute snapshot at {}", System.currentTimeMillis()); 481 | } 482 | 483 | /** 484 | * 在程序内部出错重启时,如果调用了snapshotState方法,则会恢复checkpointedState中的数据,如果是手动cancel或重试几次失败后重新提交任务,此时 485 | * 的checkpointedState会是新的对象,里面没有数据 486 | * 487 | * @param context 488 | * @throws Exception 489 | */ 490 | @Override 491 | public void initializeState(FunctionInitializationContext context) throws Exception { 492 | ListStateDescriptor descriptor = 493 | new ListStateDescriptor<>( 494 | "hbase-sink-cp", 495 | TypeInformation.of(new TypeHint() { 496 | })); 497 | 498 | checkpointedState = context.getOperatorStateStore().getListState(descriptor); 499 | 500 | if (context.isRestored()) { 501 | for (ObjectNode element : checkpointedState.get()) { 502 | nodes.add(element); 503 | } 504 | } 505 | log.info("initialState {} record", nodes.size()); 506 | } 507 | } 508 | ``` 509 | 510 | 开发过程中遇到的问题主要有两点,一是处理速度,二是批量处理时出错数据丢失的问题。 511 | 512 | 对于处理速度的优化: 513 | 514 | (1)由最开始的单条写入改成批量写入,但在获取joinTable的列时依然是逐条获取,每个rowkey调用一次get方法,比较费时 515 | 516 | (2)将joinTable的逐条get,改成批量get,速度提升了4-5倍,一是因为减少了提交请求的次数,加快返回速度;二是因为短时间内recommend2.statistics记录user_id和status_id分别都有重复,批量时可以减少实际查询rowkey的个数进而节省时间。 517 | 518 | (3)尽管做了以上两点优化,但速度还是很慢,经过打日志发现,主要是user_feature这个表获取rowkey的值太慢,于是又在发送get请求时做了如下优化: 519 | 520 | 因为user_feature的所有列都插入到集成的表中,一开始就没有在get请求时指定要获取的列簇和列名,优化就是在提交get请求时,指定所有需要获取的列簇和列名,这样明显快很多,大概提升10倍,此时写入一条join后的数据大概耗时2-3ms 521 | | batch_get user_feature(139) 134 ms| batch_get status_feature_string(160) 59 ms | 
batch_put(200) cost 195 ms | batch_total(200) cost 433 ms | per record cost 2 ms 522 | | batch_get user_feature(134) 132 ms| batch_get status_feature_string(169) 56 ms | batch_put(200) cost 201 ms | batch_total(200) cost 434 ms | per record cost 2 ms 523 | 524 | 对于处理出错数据丢失的问题: 525 | 526 | 数据丢失的场景:(1)当HbaseSink中调用了多次invoke方法,nodes中累积了一定的数量,但还没有触发写入操作,此时flink程序由于某种原因失败了自动重启,之前nodes中累积的记录就会丢失。 527 | 528 | 怎样做到数据不丢失? 529 | 530 | (1)让HbaseSink实现CheckpointedFunction接口,实现snapshotState和initializeState方法,snapshotState的调用频率和FlinkRunner中指定的checkpoint的频率一致,每次checkpoint会提交kafka的offset,并执行snapshotState方法,在snapshotState方法中,会将nodes中的元素加入到checkpointState中,当flink程序失败自动重启后,initializeState方法会从checkpointState中恢复nodes中的数据,接着处理。 531 | 532 | (2)当flink任务重试几次失败导致任务最终失败或者手动停止flink任务,再重新提交flink任务时,checkpointedState会是新的对象,不会保存上次任务失败或停止时nodes中的数据,这种情况依然会丢数据,因为程序失败自动重启和手动停止时都会调用close方法,因此在close方法中调用batchFlush方法,先写入再关闭。但重新启动时,从上次checkpoint到停止时的消息会重复处理。 533 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.singgel.bigdata 8 | flink-kafka-hbase 9 | 1.0 10 | 11 | 12 | 1.7.0 13 | 14 | 15 | 16 | 17 | 18 | org.apache.flink 19 | flink-streaming-scala_2.11 20 | ${flink.version} 21 | 22 | 23 | 24 | org.apache.flink 25 | flink-table_2.11 26 | ${flink.version} 27 | 28 | 29 | org.apache.flink 30 | flink-json 31 | 1.7.0 32 | 33 | 34 | 35 | org.apache.flink 36 | flink-connector-kafka_2.11 37 | ${flink.version} 38 | 39 | 40 | 41 | com.fasterxml.jackson.core 42 | jackson-databind 43 | 2.13.4.2 44 | 45 | 46 | 47 | org.apache.flink 48 | flink-hbase_2.11 49 | ${flink.version} 50 | 51 | 52 | 53 | org.apache.hadoop 54 | hadoop-common 55 | 3.2.4 56 | 57 | 58 | 59 | com.squareup.okhttp3 60 | okhttp 61 | 3.12.1 62 | 63 | 64 | 65 | junit 66 | junit 67 | 4.13.1 68 | test 69 | 70 | 71 | 72 | com.typesafe 73 | config 74 | 1.3.1 75 | 76 | 77 | 78 | org.projectlombok 79 | lombok 80 | 1.18.4 81 | provided 82 | 83 | 84 | 85 | 86 | org.slf4j 87 | slf4j-api 88 | 1.7.25 89 | 90 | 91 | 92 | ch.qos.logback 93 | logback-classic 94 | 1.2.3 95 | 96 | 97 | 98 | org.slf4j 99 | jcl-over-slf4j 100 | 1.7.25 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | ${artifactId}-${version} 110 | 111 | 112 | org.apache.maven.plugins 113 | maven-compiler-plugin 114 | 115 | 8 116 | 8 117 | 118 | 119 | 120 | org.apache.maven.plugins 121 | maven-assembly-plugin 122 | 2.4.1 123 | 124 | 125 | 126 | jar-with-dependencies 127 | 128 | 129 | 130 | 131 | com.singgel.bigdata.flinksinkhbase.FlinkRunner 132 | 133 | 134 | 135 | 136 | 137 | make-assembly 138 | 139 | package 140 | 141 | single 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | -------------------------------------------------------------------------------- /src/main/java/com/singgel/bigdata/flinksinkhbase/FlinkRunner.java: -------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.flinksinkhbase; 2 | 3 | import com.singgel.bigdata.flinksinkhbase.common.CustomJsonDeserializationSchema; 4 | import com.singgel.bigdata.flinksinkhbase.common.HbaseUtil; 5 | import com.singgel.bigdata.flinksinkhbase.common.JobConfigManager; 6 | import com.singgel.bigdata.flinksinkhbase.config.JobConfig; 7 | import lombok.extern.slf4j.Slf4j; 8 | import org.apache.flink.api.common.restartstrategy.RestartStrategies; 9 | import org.apache.flink.api.common.time.Time; 10 | import org.apache.flink.api.java.utils.ParameterTool; 11 | import 
org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.ObjectNode; 12 | import org.apache.flink.streaming.api.CheckpointingMode; 13 | import org.apache.flink.streaming.api.datastream.DataStreamSource; 14 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 15 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer; 16 | import org.apache.hadoop.hbase.client.Put; 17 | 18 | import java.util.*; 19 | import java.util.concurrent.TimeUnit; 20 | 21 | /** 22 | * \* @author singgel 23 | * \* @created_at: 2019/3/24 下午1:49 24 | * \ 25 | */ 26 | @Slf4j 27 | public class FlinkRunner { 28 | 29 | public static void main(String[] args) throws Exception { 30 | 31 | String jobKey; 32 | try { 33 | final ParameterTool params = ParameterTool.fromArgs(args); 34 | jobKey = params.get("jobKey"); 35 | } catch (Exception e) { 36 | System.err.println("No jobKey specified. Please run 'FlinkRunner --jobKey '"); 37 | return; 38 | } 39 | 40 | JobConfig jobConfig = JobConfigManager.getConfigByKey(jobKey); 41 | jobConfig.validate(); 42 | 43 | HbaseUtil hbaseUtil = new HbaseUtil(jobConfig.getHbaseConfig()); 44 | hbaseUtil.prepareTable(jobConfig.getTableName(), jobConfig.families()); 45 | 46 | final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 47 | 48 | env.getConfig().registerKryoType(JobConfig.class); 49 | env.getConfig().registerKryoType(Put.class); 50 | env.getConfig().registerKryoType(HbaseUtil.class); 51 | 52 | Properties prop = jobConfig.getKafkaConfig().kafkaProps(); 53 | DataStreamSource dataStreamSource = env.addSource(new FlinkKafkaConsumer<>(jobConfig.getKafkaConfig().getTopic(), 54 | new CustomJsonDeserializationSchema(true), prop)); 55 | 56 | dataStreamSource.addSink(new HbaseSink(jobConfig)); 57 | 58 | //设置最大失败重启尝试次数及每次重启间隔时间 59 | env.setRestartStrategy(RestartStrategies.fixedDelayRestart( 60 | 3, 61 | Time.of(10L, TimeUnit.SECONDS) 62 | )); 63 | 64 | env.enableCheckpointing(JobConfig.CHECKPOINT_INTERVAR, CheckpointingMode.EXACTLY_ONCE); 65 | 66 | env.execute(jobConfig.getJobName()); 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /src/main/java/com/singgel/bigdata/flinksinkhbase/HbaseSink.java: -------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.flinksinkhbase; 2 | 3 | import com.singgel.bigdata.flinksinkhbase.common.HbaseUtil; 4 | import com.singgel.bigdata.flinksinkhbase.common.JoinTable; 5 | import com.singgel.bigdata.flinksinkhbase.common.ValueFormat; 6 | import com.singgel.bigdata.flinksinkhbase.config.JobConfig; 7 | import lombok.extern.slf4j.Slf4j; 8 | import org.apache.commons.lang3.StringUtils; 9 | import org.apache.flink.api.common.state.ListState; 10 | import org.apache.flink.api.common.state.ListStateDescriptor; 11 | import org.apache.flink.api.common.typeinfo.TypeHint; 12 | import org.apache.flink.api.common.typeinfo.TypeInformation; 13 | import org.apache.flink.configuration.Configuration; 14 | import org.apache.flink.runtime.state.FunctionInitializationContext; 15 | import org.apache.flink.runtime.state.FunctionSnapshotContext; 16 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.ObjectMapper; 17 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.ObjectNode; 18 | import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; 19 | import org.apache.flink.streaming.api.functions.sink.RichSinkFunction; 20 | import 
org.apache.hadoop.hbase.client.Get; 21 | import org.apache.hadoop.hbase.client.Put; 22 | import org.apache.hadoop.hbase.client.Result; 23 | import org.apache.hadoop.hbase.util.Bytes; 24 | 25 | import java.io.IOException; 26 | import java.util.*; 27 | 28 | /** 29 | * \* @author singgel 30 | * \* @created_at: 2019/3/31 下午12:29 31 | * \ 32 | */ 33 | 34 | @Slf4j 35 | public class HbaseSink extends RichSinkFunction implements CheckpointedFunction { 36 | 37 | private final JobConfig jobConfig; 38 | 39 | private HbaseUtil hbaseUtil; 40 | 41 | private long currentTime = System.currentTimeMillis(); 42 | 43 | /** 44 | * 在flink任务自动重试时,会先恢复state中的数据;如果是cancel掉flink任务,重新手动提交,则state会清空 45 | */ 46 | private transient ListState checkpointedState; 47 | 48 | private List nodes = new ArrayList<>(); 49 | 50 | private static ObjectMapper MAPPER = new ObjectMapper(); 51 | 52 | private StringBuilder sbLog = new StringBuilder(); 53 | 54 | public HbaseSink(JobConfig jobConfig) { 55 | this.jobConfig = jobConfig; 56 | } 57 | 58 | 59 | @Override 60 | public void open(Configuration parameters) throws Exception { 61 | super.open(parameters); 62 | this.hbaseUtil = new HbaseUtil(jobConfig.getHbaseConfig()); 63 | } 64 | 65 | /** 66 | * 在手动cancel和程序内部出错重试时都会触发close方法,在close方法中将nodes中的数据先flush,防止在两次写入之间,checkpoint点之前的数据丢失 67 | * 但会有数据重复,checkpoint点之后到发生故障时的数据会重复,如下示意图: 68 | *
 69 |      *                       flush
 70 |      *                        ^
 71 |      *                       /
 72 |      *            +--------------------------+
 73 |      * ------write------checkpoint-----------down----
 74 |      *                           +-----------+
 75 |      *                                 ^
 76 |      *                                /
 77 |      *                           will repeat
 78 |      * 
79 | *
80 | * 但对于写入具有幂等性的业务,数据重复写入不会影响结果 81 | * 82 | * @throws Exception 83 | */ 84 | @Override 85 | public void close() throws Exception { 86 | log.debug("execute sink close method"); 87 | super.close(); 88 | batchFlush(); 89 | if (this.hbaseUtil.getConnection() != null) { 90 | try { 91 | this.hbaseUtil.getConnection().close(); 92 | } catch (Exception e) { 93 | log.warn("connection close failed. error:{} ", e.getMessage()); 94 | } 95 | } 96 | } 97 | 98 | /** 99 | * 每条记录调用一次此方法 100 | * 101 | * @param node 102 | * @param context 103 | * @throws Exception 104 | */ 105 | @Override 106 | public void invoke(ObjectNode node, Context context) throws Exception { 107 | String partition = node.get("metadata").get("partition").asText(); 108 | String offset = node.get("metadata").get("offset").asText(); 109 | String value = node.get("value").asText(); 110 | log.debug("partition->{}|offset->{}|value->{}", partition, offset, value); 111 | nodes.add(node); 112 | if (nodes.size() >= jobConfig.getHbaseConfig().getBatchCount() || 113 | (System.currentTimeMillis() - currentTime > jobConfig.getHbaseConfig().getInterval() && nodes.size() > 0)) { 114 | batchFlush(); 115 | } 116 | } 117 | 118 | /** 119 | * 将{@link HbaseSink#nodes}中的数据批量写入 120 | * 121 | * @throws IOException 122 | */ 123 | private void batchFlush() throws IOException { 124 | long start = System.currentTimeMillis(); 125 | List puts = convertBatch(nodes); 126 | if (puts.size() == 0) { 127 | return; 128 | } 129 | long startPut = System.currentTimeMillis(); 130 | hbaseUtil.putBatchData(jobConfig.getTableName(), puts); 131 | long end = System.currentTimeMillis(); 132 | sbLog.append(String.format(" | batch_put(%d) cost %d ms", puts.size(), end - startPut)); 133 | sbLog.append(String.format(" | batch_total(%d) cost %d ms", puts.size(), end - start)); 134 | sbLog.append(String.format(" | per record cost %d ms", (end - start) / puts.size())); 135 | log.debug(sbLog.toString()); 136 | currentTime = System.currentTimeMillis(); 137 | sbLog = new StringBuilder(); 138 | nodes.clear(); 139 | checkpointedState.clear(); 140 | } 141 | 142 | /** 143 | * 批量处理 144 | * 145 | * @param objectNodes 一批数据 146 | * @return 返回批量的Put 147 | */ 148 | private List convertBatch(List objectNodes) throws IOException { 149 | Map> puts = new HashMap<>(objectNodes.size()); 150 | //存储每个需要join的表中这个批次的rowkey的值 151 | Map> joinKeys = new HashMap<>(objectNodes.size()); 152 | 153 | for (ObjectNode node : objectNodes) { 154 | Map keyValues = getKeyValues(node); 155 | 156 | //获取拼接的rowKey 157 | List rowKeyValues = new ArrayList<>(); 158 | jobConfig.getRowKeyColumns().forEach(e -> rowKeyValues.add(keyValues.get(e))); 159 | if (rowKeyValues.stream().anyMatch(Objects::isNull)) { 160 | //如果组合rowKey的字段中有null,则过滤掉此记录 161 | log.warn("columns which consist of rowKey has null value"); 162 | continue; 163 | } 164 | String rowKey = String.join(jobConfig.getRowKeyDelimiter(), rowKeyValues); 165 | Put put = new Put(Bytes.toBytes(rowKey)); 166 | 167 | //获取这个joinTable表中这个批次所有需要join的key 168 | for (JoinTable joinTable : jobConfig.getJoinTables()) { 169 | 170 | joinKeys.compute(joinTable.getTableName(), (k, v) -> { 171 | //keyValues.get(joinTable.getJoinKey()的值有可能为null,需做空判断 172 | if (keyValues.get(joinTable.getJoinKey()) != null) { 173 | if (v == null) { 174 | v = new HashSet<>(); 175 | v.add(keyValues.get(joinTable.getJoinKey())); 176 | } else { 177 | v.add(keyValues.get(joinTable.getJoinKey())); 178 | } 179 | } 180 | return v; 181 | }); 182 | } 183 | 184 | 185 | //原始topic需要写入的列 186 | keyValues.forEach((k, v) -> { 187 
| String family = k.split(":")[0]; 188 | String column = k.split(":")[1]; 189 | put.addColumn(Bytes.toBytes(family), Bytes.toBytes(column), v != null ? Bytes.toBytes(v) : null); 190 | }); 191 | 192 | puts.put(put, keyValues); 193 | } 194 | 195 | //当需要join时执行下面操作 196 | for (JoinTable joinTable : jobConfig.getJoinTables()) { 197 | //取出这个joinTable表中这个批次所有需要join的key 198 | Set keys = joinKeys.get(joinTable.getTableName()); 199 | List gets = new ArrayList<>(); 200 | //将key和result一一对应 201 | Map keyResults = new HashMap<>(keys.size()); 202 | 203 | //生成需要批量get的List 204 | keys.forEach(e -> { 205 | Get get = new Get(Bytes.toBytes(e)); 206 | joinTable.getColumnsMapping().forEach((k, v) -> { 207 | get.addColumn(Bytes.toBytes(k.split(":")[0]), Bytes.toBytes(k.split(":")[1])); 208 | }); 209 | gets.add(get); 210 | }); 211 | 212 | 213 | long start = System.currentTimeMillis(); 214 | //执行批量get 215 | Result[] results = hbaseUtil.batchGet(joinTable.getTableName(), gets); 216 | for (Result result : results) { 217 | if (result != null) { 218 | keyResults.put(Bytes.toString(result.getRow()), result); 219 | } 220 | } 221 | long end = System.currentTimeMillis(); 222 | 223 | sbLog.append(String.format("| batch_get %s(%d) %d ms", joinTable.getTableName(), keys.size(), (end - start))); 224 | //对之前原始写入的每个put,获取这个表需要join的rowKey的result,然后将result中的值根据joinTable的配置添加到put的对应列中 225 | puts.forEach((put, keyValues) -> { 226 | Result result = keyResults.get(keyValues.get(joinTable.getJoinKey())); 227 | if (result != null) { 228 | joinTable.getColumnsMapping().forEach((k, v) -> { 229 | byte[] columnValue = result.getValue(Bytes.toBytes(k.split(":")[0]), Bytes.toBytes(k.split(":")[1])); 230 | put.addColumn(Bytes.toBytes(v.split(":")[0]), Bytes.toBytes(v.split(":")[1]), columnValue); 231 | }); 232 | } 233 | }); 234 | } 235 | 236 | 237 | return new ArrayList<>(puts.keySet()); 238 | } 239 | 240 | private Put convert(ObjectNode node) throws IOException { 241 | 242 | Map keyValues = getKeyValues(node); 243 | 244 | //获取拼接的rowKey 245 | List rowKeyValues = new ArrayList<>(); 246 | 247 | jobConfig.getRowKeyColumns().forEach(e -> rowKeyValues.add(keyValues.get(e))); 248 | String rowKey = String.join(jobConfig.getRowKeyDelimiter(), rowKeyValues); 249 | 250 | Put put = new Put(Bytes.toBytes(rowKey)); 251 | 252 | //原始topic需要写入的列 253 | keyValues.forEach((k, v) -> { 254 | String family = k.split(":")[0]; 255 | String column = k.split(":")[1]; 256 | put.addColumn(Bytes.toBytes(family), Bytes.toBytes(column), Bytes.toBytes(v)); 257 | }); 258 | 259 | //需要join的table 260 | for (JoinTable joinTable : jobConfig.getJoinTables()) { 261 | byte[] joinKey = Bytes.toBytes(keyValues.get(joinTable.getJoinKey())); 262 | Get get = new Get(joinKey); 263 | joinTable.getColumnsMapping().forEach((k, v) -> { 264 | get.addColumn(Bytes.toBytes(k.split(":")[0]), Bytes.toBytes(k.split(":")[1])); 265 | }); 266 | //获取此rowKey所有的列值 267 | Result result = hbaseUtil.singleGet(joinTable.getTableName(), get); 268 | //如果result为空,则不需处理 269 | if (result != null) { 270 | joinTable.getColumnsMapping().forEach((k, v) -> { 271 | byte[] columnValue = result.getValue(Bytes.toBytes(k.split(":")[0]), Bytes.toBytes(k.split(":")[1])); 272 | put.addColumn(Bytes.toBytes(v.split(":")[0]), Bytes.toBytes(v.split(":")[1]), columnValue); 273 | }); 274 | } 275 | } 276 | return put; 277 | } 278 | 279 | /** 280 | * 根据配置中给定的列值对应关系,将每条消息解析成格式,key为配置中指定的列名(包含列簇) 281 | * 目前支持两种消息格式:CSV和JSON格式的字符串型数据, 282 | * 在处理消息时,kafka消息的key默认会被集成到value中, 
对于CSV格式,kafka消息的key处在index=0的位置;对于JSON格式,kafka消息的key对应默认的kafka_key字段 283 | * 284 | * @param node flink接入的kafka消息 285 | * @return 返回字段名称对应的值 286 | */ 287 | private Map getKeyValues(ObjectNode node) { 288 | Map indexColumns = jobConfig.getIndexColumnMapping(); 289 | String key = node.get("key") == null ? "" : node.get("key").asText(); 290 | String value = node.get("value") == null ? "" : node.get("value").asText(); 291 | 292 | 293 | Map keyValues = new HashMap<>(8); 294 | 295 | ValueFormat valueFormat = jobConfig.getKafkaConfig().getValueFormat(); 296 | switch (valueFormat) { 297 | case CSV: 298 | //将key和value拼接起来,配置时kafka的key值作为下标的第0个 299 | String input = key + jobConfig.getKafkaConfig().getDelimiter() + value; 300 | String[] columnValues = StringUtils.splitPreserveAllTokens(input, jobConfig.getKafkaConfig().getDelimiter()); 301 | 302 | //将index对应的列值写入对应的列名下,列名包含了列簇名,形如:family:qualifier 303 | for (Map.Entry entry : indexColumns.entrySet()) { 304 | try { 305 | keyValues.put(entry.getValue(), columnValues[Integer.valueOf(entry.getKey())]); 306 | } catch (Exception e) { 307 | 308 | log.warn("index {} out of boundary.", entry.getKey(), e); 309 | } 310 | } 311 | 312 | break; 313 | case JSON: 314 | default: 315 | //将kafka的key加入node,统一处理 316 | try { 317 | 318 | ObjectNode jsonNode = (ObjectNode) MAPPER.readTree(value); 319 | 320 | jsonNode.put("kafka_key", key); 321 | 322 | //将配置中指定的列值写入对应的列名下,列名包含了列簇名,形如:family:qualifier 323 | indexColumns.forEach((k, v) -> { 324 | if (jsonNode.get(k) != null) { 325 | keyValues.put(v, jsonNode.get(k).asText()); 326 | } 327 | }); 328 | } catch (IOException e) { 329 | keyValues.clear(); 330 | indexColumns.forEach((k, v) -> keyValues.put(v, null)); 331 | String partition = node.get("metadata").get("partition").asText(); 332 | String offset = node.get("metadata").get("offset").asText(); 333 | String topic = node.get("metadata").get("topic").asText(); 334 | log.warn("this json record failed.topic->{},partition->{},offset->{},value->{}", topic, partition, offset, value); 335 | } 336 | break; 337 | } 338 | return keyValues; 339 | } 340 | 341 | /** 342 | * 执行频率和{@link FlinkRunner}中指定的checkpoint间隔一致 343 | * 344 | * @param context 345 | * @throws Exception 346 | */ 347 | @Override 348 | public void snapshotState(FunctionSnapshotContext context) throws Exception { 349 | for (ObjectNode element : nodes) { 350 | checkpointedState.add(element); 351 | } 352 | log.debug("execute snapshot at {}", System.currentTimeMillis()); 353 | } 354 | 355 | /** 356 | * 在程序内部出错重启时,如果调用了snapshotState方法,则会恢复checkpointedState中的数据,如果是手动cancel或重试几次失败后重新提交任务,此时 357 | * 的checkpointedState会是新的对象,里面没有数据 358 | * 359 | * @param context 360 | * @throws Exception 361 | */ 362 | @Override 363 | public void initializeState(FunctionInitializationContext context) throws Exception { 364 | ListStateDescriptor descriptor = 365 | new ListStateDescriptor<>( 366 | "hbase-sink-cp", 367 | TypeInformation.of(new TypeHint() { 368 | })); 369 | 370 | checkpointedState = context.getOperatorStateStore().getListState(descriptor); 371 | 372 | if (context.isRestored()) { 373 | for (ObjectNode element : checkpointedState.get()) { 374 | nodes.add(element); 375 | } 376 | } 377 | log.info("initialState {} record", nodes.size()); 378 | } 379 | } 380 | -------------------------------------------------------------------------------- /src/main/java/com/singgel/bigdata/flinksinkhbase/common/CustomJsonDeserializationSchema.java: -------------------------------------------------------------------------------- 1 | package 
com.singgel.bigdata.flinksinkhbase.common; 2 | 3 | import org.apache.flink.api.common.typeinfo.TypeInformation; 4 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.ObjectMapper; 5 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.ObjectNode; 6 | import org.apache.flink.streaming.util.serialization.KeyedDeserializationSchema; 7 | 8 | import java.io.IOException; 9 | 10 | import static org.apache.flink.api.java.typeutils.TypeExtractor.getForClass; 11 | 12 | /** 13 | * \* @author singgel 14 | * \* @created_at: 2019/3/24 下午2:22 15 | * \ 16 | */ 17 | public class CustomJsonDeserializationSchema implements KeyedDeserializationSchema { 18 | 19 | private final boolean includeMetadata; 20 | private ObjectMapper mapper; 21 | 22 | public CustomJsonDeserializationSchema(boolean includeMetadata) { 23 | this.includeMetadata = includeMetadata; 24 | } 25 | 26 | @Override 27 | public ObjectNode deserialize(byte[] messageKey, byte[] message, String topic, int partition, long offset) throws IOException { 28 | if (mapper == null) { 29 | mapper = new ObjectMapper(); 30 | } 31 | ObjectNode node = mapper.createObjectNode(); 32 | if (messageKey != null) { 33 | node.put("key", new String(messageKey, "utf-8")); 34 | } 35 | if (message != null) { 36 | node.put("value", new String(message, "utf-8")); 37 | } 38 | if (includeMetadata) { 39 | node.putObject("metadata") 40 | .put("offset", offset) 41 | .put("topic", topic) 42 | .put("partition", partition); 43 | } 44 | return node; 45 | } 46 | 47 | @Override 48 | public boolean isEndOfStream(ObjectNode nextElement) { 49 | return false; 50 | } 51 | 52 | @Override 53 | public TypeInformation getProducedType() { 54 | return getForClass(ObjectNode.class); 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /src/main/java/com/singgel/bigdata/flinksinkhbase/common/HbaseUtil.java: -------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.flinksinkhbase.common; 2 | 3 | import com.singgel.bigdata.flinksinkhbase.config.HbaseConfig; 4 | import lombok.extern.slf4j.Slf4j; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.hbase.HBaseConfiguration; 7 | import org.apache.hadoop.hbase.HColumnDescriptor; 8 | import org.apache.hadoop.hbase.HTableDescriptor; 9 | import org.apache.hadoop.hbase.TableName; 10 | import org.apache.hadoop.hbase.client.*; 11 | import org.slf4j.Logger; 12 | import org.slf4j.LoggerFactory; 13 | 14 | import java.io.IOException; 15 | import java.io.Serializable; 16 | import java.util.ArrayList; 17 | import java.util.List; 18 | 19 | /** 20 | * \* @author singgel 21 | * \* @created_at: 2019/3/24 下午1:48 22 | * \ 23 | */ 24 | @Slf4j 25 | public class HbaseUtil implements Serializable { 26 | 27 | 28 | private static Logger logger = LoggerFactory.getLogger(HbaseUtil.class); 29 | 30 | private Configuration configuration; 31 | private Connection connection; 32 | 33 | public HbaseUtil(HbaseConfig hbaseConfig) { 34 | this.configuration = HBaseConfiguration.create(); 35 | this.configuration.set("hbase.zookeeper.quorum", hbaseConfig.getZookerperQuorum()); 36 | this.configuration.set("hbase.zookeeper.property.clientPort", hbaseConfig.getPort()); 37 | this.configuration.set("zookeeper.znode.parent", hbaseConfig.getZookeeperZondeParent()); 38 | hbaseConfig.getOptionalProp().forEach((k, v) -> this.configuration.set(k, v)); 39 | try { 40 | connection = ConnectionFactory.createConnection(configuration); 
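            // A single HBase Connection is created here from the zookeeper quorum/port/znode settings above.
            // HbaseSink.open() builds one HbaseUtil (and thus one Connection) per parallel sink instance and
            // reuses it for all singleGet/batchGet/putBatchData calls, each of which obtains a lightweight
            // Table from it per request; the Connection is closed again in HbaseSink.close().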
41 | } catch (IOException e) { 42 | e.printStackTrace(); 43 | } 44 | } 45 | 46 | public Connection getConnection() { 47 | return connection; 48 | } 49 | 50 | /** 51 | * 获取某个rowKey的所有列簇所有列值 52 | * 53 | * @param tableName hbase表名 54 | * @param get 只指定了rowKey的get 55 | * @return 返回result 56 | */ 57 | public Result singleGet(String tableName, Get get) throws IOException { 58 | 59 | Result result = null; 60 | try (Table table = connection.getTable(TableName.valueOf(tableName))) { 61 | result = table.get(get); 62 | 63 | } catch (IOException e) { 64 | log.error("singleGet rowKey:{} get failed", new String(get.getRow()), e); 65 | throw e; 66 | } 67 | return result; 68 | } 69 | 70 | /** 71 | * 批量获取 72 | * 73 | * @param tableName 表名 74 | * @param gets get列表 75 | * @return 76 | */ 77 | public Result[] batchGet(String tableName, List gets) throws IOException { 78 | Result[] results = null; 79 | try (Table table = connection.getTable(TableName.valueOf(tableName))) { 80 | results = table.get(gets); 81 | 82 | } catch (IOException e) { 83 | logger.warn("batchGets get failed", e); 84 | throw e; 85 | } 86 | return results; 87 | } 88 | 89 | 90 | /** 91 | * 向hbase表插入数据 92 | * 93 | * @param tableName hbase表名 94 | * @param put 要插入的put,需指定列簇和列 95 | */ 96 | public void putData(String tableName, Put put) { 97 | System.out.println("begin put"); 98 | try (Table table = connection.getTable(TableName.valueOf(tableName))) { 99 | table.put(put); 100 | System.out.println("put success"); 101 | } catch (IOException e) { 102 | logger.warn("rowKey:{} put failed", new String(put.getRow()), e); 103 | } 104 | } 105 | 106 | /** 107 | * 向hbase表批量插入数据 108 | * 109 | * @param tableName hbase表名 110 | * @param puts 要插入的puts,需指定列簇和列 111 | */ 112 | public void putBatchData(String tableName, List puts) throws IOException { 113 | try (Table table = connection.getTable(TableName.valueOf(tableName))) { 114 | table.put(puts); 115 | } catch (IOException e) { 116 | log.error("put batch data failed",e); 117 | throw e; 118 | } 119 | } 120 | 121 | /** 122 | * 准备要写入的hbase表,如果表不存在则创建,并添加列簇,如果存在则添加不存在的列簇 123 | * 124 | * @param tableName hbase表名 125 | * @param families 写入的列簇 126 | * @throws IOException 127 | */ 128 | public void prepareTable(String tableName, Iterable families) throws IOException { 129 | try { 130 | HBaseAdmin admin = (HBaseAdmin) connection.getAdmin(); 131 | if (admin.tableExists(tableName)) { 132 | Table table = connection.getTable(TableName.valueOf(tableName)); 133 | HTableDescriptor hTableDescriptor = table.getTableDescriptor(); 134 | List existFamilies = new ArrayList<>(); 135 | List needAddedFamilies = new ArrayList<>(); 136 | for (HColumnDescriptor fdescriptor : hTableDescriptor.getColumnFamilies()) { 137 | existFamilies.add(fdescriptor.getNameAsString()); 138 | } 139 | for (String family : families) { 140 | if (!existFamilies.contains(family)) { 141 | needAddedFamilies.add(family); 142 | } 143 | } 144 | //当有需要新增的列簇时再disable table,增加列簇 145 | if (needAddedFamilies.size() > 0) { 146 | admin.disableTable(tableName); 147 | for (String family : needAddedFamilies) { 148 | admin.addColumn(tableName, new HColumnDescriptor(family)); 149 | } 150 | admin.enableTable(tableName); 151 | } 152 | } else { 153 | HTableDescriptor hTableDescriptor = new HTableDescriptor(TableName.valueOf(tableName)); 154 | for (String family : families) { 155 | hTableDescriptor.addFamily(new HColumnDescriptor(family)); 156 | } 157 | admin.createTable(hTableDescriptor); 158 | } 159 | admin.close(); 160 | connection.close(); 161 | } catch (IOException e) { 162 | 
log.error("prepare table failed! check the table and columnFamilies.",e); 163 | throw e; 164 | } 165 | 166 | 167 | } 168 | 169 | } 170 | -------------------------------------------------------------------------------- /src/main/java/com/singgel/bigdata/flinksinkhbase/common/JobConfigManager.java: -------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.flinksinkhbase.common; 2 | 3 | import com.fasterxml.jackson.databind.ObjectMapper; 4 | import com.typesafe.config.ConfigFactory; 5 | import com.singgel.bigdata.flinksinkhbase.config.JobConfig; 6 | 7 | import java.io.BufferedReader; 8 | import java.io.IOException; 9 | import java.io.InputStreamReader; 10 | import java.net.HttpURLConnection; 11 | import java.net.URL; 12 | 13 | /** 14 | * \* @author singgel 15 | * \* @created_at: 2019/3/24 下午5:38 16 | * \ 17 | */ 18 | public class JobConfigManager { 19 | 20 | private static final ObjectMapper MAPPER = new ObjectMapper(); 21 | private static final String URL = ConfigFactory.load().getConfig("apollo").getString("url"); 22 | 23 | /** 24 | * 从Apollo配置中心获取jobConfig的配置 25 | * 26 | * @param key 配置的key 27 | * @return JobConfig 28 | * @throws IOException 29 | */ 30 | public static JobConfig getConfigByKey(String key) throws Exception { 31 | 32 | java.net.URL url = new URL(String.format("%s/apollo/getConf?key=%s",URL, key)); 33 | HttpURLConnection con = (HttpURLConnection) url.openConnection(); 34 | 35 | con.setRequestMethod("GET"); 36 | int responseCode = con.getResponseCode(); 37 | System.out.println("\nSending 'GET' request to URLSTR : " + url); 38 | System.out.println("Response Code : " + responseCode); 39 | 40 | BufferedReader in = new BufferedReader( 41 | new InputStreamReader(con.getInputStream())); 42 | String inputLine; 43 | StringBuffer response = new StringBuffer(); 44 | 45 | while ((inputLine = in.readLine()) != null) { 46 | response.append(inputLine); 47 | } 48 | in.close(); 49 | 50 | //打印结果 51 | System.out.println(response.toString()); 52 | String jsonRet = MAPPER.readTree(response.toString()).get("data").asText(); 53 | JobConfig jobConfig = MAPPER.readValue(jsonRet, JobConfig.class); 54 | 55 | return jobConfig; 56 | 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/com/singgel/bigdata/flinksinkhbase/common/JoinTable.java: -------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.flinksinkhbase.common; 2 | 3 | import java.io.Serializable; 4 | import java.util.Map; 5 | 6 | /** 7 | * \* @author singgel 8 | * \* @created_at: 2019/3/24 下午5:03 9 | * \ 10 | */ 11 | 12 | /** 13 | * 需要join的Hbase表,按照主表中的某列值作为此表中的rowKey,获取此表此rowKey的相关列值,插入到主表对应的列,完成join 14 | */ 15 | public class JoinTable implements Serializable{ 16 | 17 | /** 18 | * hbase 表名 19 | */ 20 | private String tableName; 21 | 22 | /** 23 | * 和此表RowKey相关联的主表列名 24 | */ 25 | private String joinKey; 26 | 27 | /** 28 | * 此表中列簇和列的对应关系 29 | */ 30 | 31 | /** 32 | * 此表中列和列簇及写入到主表中的列簇的对应关系,如: 33 | * key-> fromFamily:fromColumn 34 | * value -> toFamily:toColumn 35 | */ 36 | private Map columnsMapping; 37 | 38 | 39 | public String getTableName() { 40 | return tableName; 41 | } 42 | 43 | public void setTableName(String tableName) { 44 | this.tableName = tableName; 45 | } 46 | 47 | public String getJoinKey() { 48 | return joinKey; 49 | } 50 | 51 | public void setJoinKey(String joinKey) { 52 | this.joinKey = joinKey; 53 | } 54 | 55 | public Map getColumnsMapping() { 56 | return 
columnsMapping; 57 | } 58 | 59 | public void setColumnsMapping(Map columnsMapping) { 60 | this.columnsMapping = columnsMapping; 61 | } 62 | 63 | public JoinTable() { 64 | } 65 | 66 | public JoinTable(String tableName, String joinKey, Map columnsMapping) { 67 | this.tableName = tableName; 68 | this.joinKey = joinKey; 69 | this.columnsMapping = columnsMapping; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/java/com/singgel/bigdata/flinksinkhbase/common/ValueFormat.java: -------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.flinksinkhbase.common; 2 | 3 | /** 4 | * kafka消息值的类型格式 5 | */ 6 | public enum ValueFormat { 7 | /** 8 | * CSV格式,以固定分隔符分割 9 | */ 10 | CSV, 11 | 12 | /** 13 | * ObjecNode的json格式 14 | */ 15 | JSON 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/com/singgel/bigdata/flinksinkhbase/config/HbaseConfig.java: -------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.flinksinkhbase.config; 2 | 3 | import java.io.Serializable; 4 | import java.util.Map; 5 | 6 | /** 7 | * \* @author singgel 8 | * \* @created_at: 2019/3/28 上午9:29 9 | * \ 10 | */ 11 | public class HbaseConfig implements Serializable{ 12 | 13 | /** 14 | * zookeeper机器名,多个用逗号连接 15 | */ 16 | private String zookerperQuorum; 17 | 18 | /** 19 | * 端口,默认2181 20 | */ 21 | private String port = "2181"; 22 | 23 | /** 24 | * hbase在zookeeper节点的路径 25 | */ 26 | private String zookeeperZondeParent; 27 | 28 | /** 29 | * 其它非必需配置 30 | */ 31 | private Map optionalProp; 32 | 33 | /** 34 | * 批量写入时每批的数量 35 | */ 36 | private int batchCount; 37 | 38 | /** 39 | * 每批次的时间间隔,单位:毫秒 40 | */ 41 | private long interval; 42 | 43 | public HbaseConfig() { 44 | } 45 | 46 | public HbaseConfig(String zookerperQuorum, 47 | String port, 48 | String zookeeperZondeParent, 49 | int batchCount, 50 | long interval, 51 | Map optionalProp) { 52 | this.zookerperQuorum = zookerperQuorum; 53 | this.port = port; 54 | this.zookeeperZondeParent = zookeeperZondeParent; 55 | this.batchCount = batchCount; 56 | this.interval = interval; 57 | this.optionalProp =optionalProp; 58 | } 59 | 60 | public String getZookerperQuorum() { 61 | return zookerperQuorum; 62 | } 63 | 64 | public void setZookerperQuorum(String zookerperQuorum) { 65 | this.zookerperQuorum = zookerperQuorum; 66 | } 67 | 68 | public String getPort() { 69 | return port; 70 | } 71 | 72 | public void setPort(String port) { 73 | this.port = port; 74 | } 75 | 76 | public String getZookeeperZondeParent() { 77 | return zookeeperZondeParent; 78 | } 79 | 80 | public void setZookeeperZondeParent(String zookeeperZondeParent) { 81 | this.zookeeperZondeParent = zookeeperZondeParent; 82 | } 83 | 84 | public int getBatchCount() { 85 | return batchCount; 86 | } 87 | 88 | public void setBatchCount(int batchCount) { 89 | this.batchCount = batchCount; 90 | } 91 | 92 | public long getInterval() { 93 | return interval; 94 | } 95 | 96 | public void setInterval(long interval) { 97 | this.interval = interval; 98 | } 99 | 100 | public Map getOptionalProp() { 101 | return optionalProp; 102 | } 103 | 104 | public void setOptionalProp(Map optionalProp) { 105 | this.optionalProp = optionalProp; 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/main/java/com/singgel/bigdata/flinksinkhbase/config/JobConfig.java: 
-------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.flinksinkhbase.config; 2 | 3 | 4 | import com.singgel.bigdata.flinksinkhbase.common.JoinTable; 5 | import com.singgel.bigdata.flinksinkhbase.common.ValueFormat; 6 | import org.apache.commons.lang3.StringUtils; 7 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.core.JsonProcessingException; 8 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.ObjectMapper; 9 | 10 | import java.io.Serializable; 11 | import java.util.*; 12 | import java.util.stream.Collectors; 13 | 14 | /** 15 | * \* @author singgel 16 | * \* @created_at: 2019/3/24 下午1:50 17 | * \ 18 | */ 19 | public class JobConfig implements Serializable { 20 | 21 | private static final ObjectMapper MAPPER = new ObjectMapper(); 22 | 23 | public static final long CHECKPOINT_INTERVAR = 10000L; 24 | 25 | /** 26 | * kafka消息(整合key和value的值)按分隔符分隔后index和列信息的对应关系 27 | */ 28 | private Map indexColumnMapping = new HashMap<>(); 29 | 30 | 31 | /** 32 | * 多列组合成rowKey时指定的连接字符 33 | */ 34 | private String rowKeyDelimiter; 35 | 36 | /** 37 | * 组成rowKey的列 38 | */ 39 | private List rowKeyColumns; 40 | 41 | /** 42 | * 要写入hbase表的表名,称为"主表" 43 | */ 44 | private String tableName; 45 | 46 | /** 47 | * kafka源配置 48 | */ 49 | private KafkaConfig kafkaConfig; 50 | 51 | 52 | /** 53 | * hbase基本配置 54 | */ 55 | private HbaseConfig hbaseConfig; 56 | 57 | /** 58 | * flink job名字,同apollo配置的key 59 | */ 60 | private String jobName; 61 | 62 | /** 63 | * 通过实时平台前端界面{@link \http://10.10.20.81:7878/realtime/platform/jobList} 启动flink任务时指定的并发数 64 | */ 65 | private int parallelism; 66 | 67 | /** 68 | * 通过实时平台前端界面{@link \http://10.10.20.81:7878/realtime/platform/jobList} 启动flink任务时指定的jar包名称 69 | */ 70 | private String jarName; 71 | 72 | public JobConfig() { 73 | } 74 | 75 | public JobConfig(Map indexColumnMapping, 76 | String rowKeyDelimiter, 77 | List rowKeyColumns, 78 | String tableName, 79 | KafkaConfig kafkaConfig, 80 | HbaseConfig hbaseConfig, 81 | List joinTables, 82 | String jobName, 83 | int parallelism, 84 | String jarName) { 85 | this.indexColumnMapping = indexColumnMapping; 86 | this.rowKeyDelimiter = rowKeyDelimiter; 87 | this.rowKeyColumns = rowKeyColumns; 88 | this.tableName = tableName; 89 | this.kafkaConfig = kafkaConfig; 90 | this.hbaseConfig = hbaseConfig; 91 | this.joinTables = joinTables; 92 | this.jobName = jobName; 93 | this.parallelism = parallelism; 94 | this.jarName = jarName; 95 | } 96 | 97 | 98 | /** 99 | * 校验: 100 | * 1. joinTable中的joinKey需要在主表的列中存在 101 | * 2. indexColumnMapping的values不能重复 102 | * 3. joinTable中需要写入到mainTable中的列不能与mainTable原有的列重合 103 | * 4. 当valueFormat=CSV时,delimiter必须要指定 104 | * 5. 
the HBase batch-write interval must not be larger than the Flink checkpoint interval 105 | */ 106 | public void validate() throws IllegalArgumentException { 107 | for (JoinTable joinTable : this.joinTables) { 108 | if (indexColumnMapping.values().stream().noneMatch(e -> e.contains(joinTable.getJoinKey()))) { 109 | throw new IllegalArgumentException(String.format("%s does not exist in the columns of main table %s", joinTable.getJoinKey(), tableName)); 110 | } 111 | } 112 | if (new HashSet<>(indexColumnMapping.values()).size() != indexColumnMapping.values().size()) { 113 | throw new IllegalArgumentException("the values of indexColumnMapping must not contain duplicate columns"); 114 | } 115 | for (JoinTable joinTable : joinTables) { 116 | if (joinTable.getColumnsMapping().values().stream().anyMatch(e -> indexColumnMapping.values().contains(e))) { 117 | throw new IllegalArgumentException(String.format("some column in joinTable:%s has existed in mainTable:%s. Please check!", joinTable, tableName)); 118 | } 119 | } 120 | if (kafkaConfig.getValueFormat() == ValueFormat.CSV && StringUtils.isEmpty(kafkaConfig.getDelimiter())) { 121 | throw new IllegalArgumentException("the delimiter must be given when the valueFormat is CSV"); 122 | } 123 | if (hbaseConfig.getInterval() > CHECKPOINT_INTERVAR) { 124 | hbaseConfig.setInterval(CHECKPOINT_INTERVAR); 125 | } 126 | 127 | } 128 | 129 | /** 130 | * Returns the column families that the main table needs for the write 131 | * 132 | * @return set of column families 133 | */ 134 | public Set<String> families() { 135 | Set<String> mainTableFamilies = this.indexColumnMapping.values().stream().map(e -> e.split(":")[0]).collect(Collectors.toSet()); 136 | for (JoinTable joinTable : joinTables) { 137 | mainTableFamilies.addAll(joinTable.getColumnsMapping().values().stream().map(e -> e.split(":")[0]).collect(Collectors.toSet())); 138 | } 139 | return mainTableFamilies; 140 | } 141 | 142 | 143 | public Map<String, String> getIndexColumnMapping() { 144 | return indexColumnMapping; 145 | } 146 | 147 | public void setIndexColumnMapping(Map<String, String> indexColumnMapping) { 148 | this.indexColumnMapping = indexColumnMapping; 149 | } 150 | 151 | 152 | private List<JoinTable> joinTables = new ArrayList<>(); 153 | 154 | public String getRowKeyDelimiter() { 155 | return rowKeyDelimiter; 156 | } 157 | 158 | public void setRowKeyDelimiter(String rowKeyDelimiter) { 159 | this.rowKeyDelimiter = rowKeyDelimiter; 160 | } 161 | 162 | public List<String> getRowKeyColumns() { 163 | return rowKeyColumns; 164 | } 165 | 166 | public List<JoinTable> getJoinTables() { 167 | return joinTables; 168 | } 169 | 170 | public String getTableName() { 171 | 172 | return tableName; 173 | } 174 | 175 | public KafkaConfig getKafkaConfig() { 176 | return kafkaConfig; 177 | } 178 | 179 | public void setKafkaConfig(KafkaConfig kafkaConfig) { 180 | this.kafkaConfig = kafkaConfig; 181 | } 182 | 183 | public HbaseConfig getHbaseConfig() { 184 | return hbaseConfig; 185 | } 186 | 187 | public void setHbaseConfig(HbaseConfig hbaseConfig) { 188 | this.hbaseConfig = hbaseConfig; 189 | } 190 | 191 | public void setTableName(String tableName) { 192 | this.tableName = tableName; 193 | } 194 | 195 | public void setRowKeyColumns(List<String> rowKeyColumns) { 196 | this.rowKeyColumns = rowKeyColumns; 197 | } 198 | 199 | public void setJoinTables(List<JoinTable> joinTables) { 200 | this.joinTables = joinTables; 201 | } 202 | 203 | public String getJobName() { 204 | return jobName; 205 | } 206 | 207 | public void setJobName(String jobName) { 208 | this.jobName = jobName; 209 | } 210 | 211 | public int getParallelism() { 212 | return parallelism; 213 | } 214 | 215 | public void setParallelism(int parallelism) { 216 | this.parallelism = 
parallelism; 217 | } 218 | 219 | public String getJarName() { 220 | return jarName; 221 | } 222 | 223 | public void setJarName(String jarName) { 224 | this.jarName = jarName; 225 | } 226 | 227 | public static long getCheckpointIntervar() { 228 | return CHECKPOINT_INTERVAR; 229 | } 230 | 231 | @Override 232 | public String toString() { 233 | String json = null; 234 | try { 235 | json = MAPPER.writeValueAsString(this); 236 | } catch (JsonProcessingException e) { 237 | e.printStackTrace(); 238 | } 239 | return json; 240 | } 241 | 242 | } 243 | -------------------------------------------------------------------------------- /src/main/java/com/singgel/bigdata/flinksinkhbase/config/KafkaConfig.java: -------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.flinksinkhbase.config; 2 | 3 | import com.singgel.bigdata.flinksinkhbase.common.ValueFormat; 4 | 5 | import java.io.Serializable; 6 | import java.util.Map; 7 | import java.util.Properties; 8 | 9 | /** 10 | * @author singgel 11 | * @created_at: 2019/3/24 17:21 12 | * 13 | */ 14 | public class KafkaConfig implements Serializable { 15 | 16 | /** 17 | * Bootstrap servers of the source Kafka cluster 18 | */ 19 | private String bootstrapServers; 20 | 21 | /** 22 | * Source topic 23 | */ 24 | private String topic; 25 | 26 | /** 27 | * Consumer groupId used for the topic 28 | */ 29 | private String groupId; 30 | 31 | /** 32 | * Format of the Kafka value; currently CSV and JSON are supported 33 | */ 34 | private ValueFormat valueFormat; 35 | 36 | /** 37 | * Delimiter of the Kafka message value, required when valueFormat is CSV 38 | */ 39 | private String delimiter; 40 | 41 | /** 42 | * Other settings 43 | */ 44 | private Map<String, String> optionalProps; 45 | 46 | 47 | public KafkaConfig() { 48 | } 49 | 50 | 51 | public KafkaConfig(String bootstrapServers, String topic, String groupId, ValueFormat valueFormat, String delimiter, Map<String, String> optionalProps) { 52 | this.bootstrapServers = bootstrapServers; 53 | this.topic = topic; 54 | this.groupId = groupId; 55 | this.valueFormat = valueFormat; 56 | this.delimiter = delimiter; 57 | this.optionalProps = optionalProps; 58 | } 59 | 60 | public String getBootstrapServers() { 61 | return bootstrapServers; 62 | } 63 | 64 | public void setBootstrapServers(String bootstrapServers) { 65 | this.bootstrapServers = bootstrapServers; 66 | } 67 | 68 | public String getTopic() { 69 | return topic; 70 | } 71 | 72 | public void setTopic(String topic) { 73 | this.topic = topic; 74 | } 75 | 76 | public String getGroupId() { 77 | return groupId; 78 | } 79 | 80 | public void setGroupId(String groupId) { 81 | this.groupId = groupId; 82 | } 83 | 84 | public ValueFormat getValueFormat() { 85 | return valueFormat; 86 | } 87 | 88 | public void setValueFormat(ValueFormat valueFormat) { 89 | this.valueFormat = valueFormat; 90 | } 91 | 92 | public String getDelimiter() { 93 | return delimiter; 94 | } 95 | 96 | public void setDelimiter(String delimiter) { 97 | this.delimiter = delimiter; 98 | } 99 | 100 | public Map<String, String> getOptionalProps() { 101 | return optionalProps; 102 | } 103 | 104 | public void setOptionalProps(Map<String, String> optionalProps) { 105 | this.optionalProps = optionalProps; 106 | } 107 | 108 | 109 | public Properties kafkaProps() { 110 | Properties props = new Properties(); 111 | props.setProperty("bootstrap.servers", this.bootstrapServers); 112 | props.setProperty("group.id", this.groupId); 113 | optionalProps.forEach(props::setProperty); 114 | return props; 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /src/main/resources/application.conf: 
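KafkaConfig.kafkaProps() is exactly the Properties object a Flink Kafka source consumer needs. Before the resource files below, here is a hedged sketch of that wiring; the project's real entry point is FlinkRunner (not part of this excerpt), and FlinkKafkaConsumer / SimpleStringSchema are the standard Flink 1.x connector classes, which may appear as version-suffixed variants such as FlinkKafkaConsumer010 depending on the Flink version in pom.xml.

```java
import com.singgel.bigdata.flinksinkhbase.common.ValueFormat;
import com.singgel.bigdata.flinksinkhbase.config.JobConfig;
import com.singgel.bigdata.flinksinkhbase.config.KafkaConfig;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;

import java.util.HashMap;

/**
 * Sketch only: how a KafkaConfig would typically be turned into a Flink source.
 * The project's real wiring lives in FlinkRunner / HbaseSink, which are not shown here.
 */
public class KafkaSourceSketch {

    public static void main(String[] args) throws Exception {
        KafkaConfig kafkaConfig = new KafkaConfig(
                "localhost:9092", "recommend2.statistics", "flink_demo_group",
                ValueFormat.CSV, "|", new HashMap<>());

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // at-least-once delivery relies on checkpointing being enabled
        env.enableCheckpointing(JobConfig.CHECKPOINT_INTERVAR);

        FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<>(
                kafkaConfig.getTopic(), new SimpleStringSchema(), kafkaConfig.kafkaProps());

        DataStream<String> lines = env.addSource(consumer);
        lines.print();

        env.execute("kafka-source-sketch");
    }
}
```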
-------------------------------------------------------------------------------- 1 | apollo { 2 | url = "http://singgel:8080" 3 | } -------------------------------------------------------------------------------- /src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <configuration> 3 | <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender"> 4 | <encoder> 5 | <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS}|%level|%logger{1}|%msg%n</pattern> 6 | <charset>UTF-8</charset> 7 | </encoder> 8 | </appender> 9 | <root level="INFO"> 10 | <appender-ref ref="STDOUT"/> 11 | </root> 12 | </configuration> -------------------------------------------------------------------------------- /src/test/java/com/singgel/bigdata/recommend/HbaseByteTest.java: -------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.recommend; 2 | 3 | import junit.framework.TestCase; 4 | import org.apache.hadoop.hbase.util.Bytes; 5 | 6 | /** 7 | * @author singgel 8 | * @created_at: 2019/3/28 10:37 9 | * 10 | */ 11 | public class HbaseByteTest extends TestCase { 12 | 13 | public void testByte() { 14 | String userIdStr = "124235435235.4"; 15 | Long userIdLong = 124235435235L; 16 | 17 | 18 | byte[] strByte = Bytes.toBytes(userIdStr); 19 | byte[] longByte = Bytes.toBytes(userIdLong); 20 | 21 | System.out.println(Bytes.toString(strByte)); 22 | 23 | // written as a long but read back as a String: the output is garbled 24 | System.out.println(Bytes.toString(longByte)); 25 | 26 | // written as a String but read back as a long: the value is wrong 27 | System.out.println(Bytes.toLong(strByte)); 28 | 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/test/java/com/singgel/bigdata/recommend/HbaseGetTest.java: -------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.recommend; 2 | 3 | import com.singgel.bigdata.flinksinkhbase.config.HbaseConfig; 4 | import com.singgel.bigdata.flinksinkhbase.common.HbaseUtil; 5 | import junit.framework.TestCase; 6 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.ObjectMapper; 7 | import org.apache.hadoop.hbase.client.Get; 8 | import org.apache.hadoop.hbase.util.Bytes; 9 | 10 | import java.io.IOException; 11 | import java.util.HashMap; 12 | import java.util.Map; 13 | 14 | /** 15 | * @author singgel 16 | * @created_at: 2019/4/2 17:33 17 | * 18 | */ 19 | public class HbaseGetTest extends TestCase { 20 | 21 | public void testHbaseGet() throws IOException { 22 | 23 | String user = "{\n" + 24 | " \"basic:pagerank\": \"basic:pagerank\",\n" + 25 | " \"basic:country\": \"basic:country\",\n" + 26 | " \"basic:province\": \"basic:province\",\n" + 27 | " \"basic:city\": \"basic:city\",\n" + 28 | " \"basic:mobile\": \"basic:mobile\",\n" + 29 | " \"basic:follower_cluster\": \"basic:follower_cluster\",\n" + 30 | " \"basic:quality_cluster\": \"basic:quality_cluster\",\n" + 31 | " \"basic:symbol_cluster\": \"basic:symbol_cluster\",\n" + 32 | " \"basic:topic_cluster\": \"basic:topic_cluster\",\n" + 33 | " \"basic:stock_click7\": \"basic:stock_click7\",\n" + 34 | " \"basic:stock_show7\": \"basic:stock_show7\",\n" + 35 | " \"basic:stock_click30\": \"basic:stock_click30\",\n" + 36 | " \"basic:stock_show30\": \"basic:stock_show30\",\n" + 37 | " \"basic:symbol_page_enter\": \"basic:symbol_page_enter\",\n" + 38 | " \"basic:symbol_new_status\": \"basic:symbol_new_status\",\n" + 39 | " \"basic:symbol_hot\": \"basic:symbol_hot\",\n" + 40 | " \"basic:symbol_finance\": \"basic:symbol_finance\",\n" + 41 | " \"basic:symbol_news\": \"basic:symbol_news\",\n" + 42 | " \"basic:symbol_notice\": \"basic:symbol_notice\",\n" + 43 | " 
\"basic:symbol_general\": \"basic:symbol_general\",\n" + 44 | " \"basic:symbol_page_view\": \"basic:symbol_page_view\",\n" + 45 | " \"basic:symbol_page_origin\": \"basic:symbol_page_origin\",\n" + 46 | " \"basic:attention_mark\": \"basic:attention_mark\",\n" + 47 | " \"basic:rebalance_num\": \"basic:rebalance_num\",\n" + 48 | " \"basic:topic_personal_short_click\": \"basic:topic_personal_short_click\",\n" + 49 | " \"basic:topic_personal_short_show\": \"basic:topic_personal_short_show\",\n" + 50 | " \"basic:topic_personal_long_click\": \"basic:topic_personal_long_click\",\n" + 51 | " \"basic:topic_personal_long_show\": \"basic:topic_personal_long_show\",\n" + 52 | " \"basic:dislike_1st\": \"basic:dislike_1st\",\n" + 53 | " \"basic:dislike_2st\": \"basic:dislike_2st\",\n" + 54 | " \"basic:dislike_3st\": \"basic:dislike_3st\",\n" + 55 | " \"basic:dislike_4st\": \"basic:dislike_4st\",\n" + 56 | " \"basic:dislike_5st\": \"basic:dislike_5st\",\n" + 57 | " \"basic:familar_1st\": \"basic:familar_1st\",\n" + 58 | " \"basic:familar_2st\": \"basic:familar_2st\",\n" + 59 | " \"basic:familar_3st\": \"basic:familar_3st\",\n" + 60 | " \"basic:familar_4st\": \"basic:familar_4st\",\n" + 61 | " \"basic:familar_5st\": \"basic:familar_5st\",\n" + 62 | " \"basic:like_1st\": \"basic:like_1st\",\n" + 63 | " \"basic:like_2st\": \"basic:like_2st\",\n" + 64 | " \"basic:like_3st\": \"basic:like_3st\",\n" + 65 | " \"basic:like_4st\": \"basic:like_4st\",\n" + 66 | " \"basic:like_5st\": \"basic:like_5st\",\n" + 67 | " \"basic:unfamilar_1st\": \"basic:unfamilar_1st\",\n" + 68 | " \"basic:unfamilar_2st\": \"basic:unfamilar_2st\",\n" + 69 | " \"basic:unfamilar_3st\": \"basic:unfamilar_3st\",\n" + 70 | " \"basic:unfamilar_4st\": \"basic:unfamilar_4st\",\n" + 71 | " \"basic:unfamilar_5st\": \"basic:unfamilar_5st\",\n" + 72 | " \"basic:headline_down_cnt\": \"basic:headline_down_cnt\",\n" + 73 | " \"basic:headline_up_cnt\": \"basic:headline_up_cnt\",\n" + 74 | " \"basic:optional_cnt\": \"basic:optional_cnt\",\n" + 75 | " \"basic:dynamic_cnt\": \"basic:dynamic_cnt\",\n" + 76 | " \"basic:quotation_cnt\": \"basic:quotation_cnt\",\n" + 77 | " \"basic:base_rate\": \"basic:base_rate\",\n" + 78 | " \"basic:mark_gegu_enter\": \"basic:mark_gegu_enter\",\n" + 79 | " \"basic:mark_share_sum\": \"basic:mark_share_sum\",\n" + 80 | " \"basic:mark_head_dislike_sum\": \"basic:mark_head_dislike_sum\",\n" + 81 | " \"basic:mark_status_post_user_sum\": \"basic:mark_status_post_user_sum\",\n" + 82 | " \"basic:mark_search_sum\": \"basic:mark_search_sum\",\n" + 83 | " \"basic:mark_debate_post_user_num\": \"basic:mark_debate_post_user_num\",\n" + 84 | " \"basic:author_click_week\": \"basic:author_click_week\",\n" + 85 | " \"basic:author_show_week\": \"basic:author_show_week\",\n" + 86 | " \"basic:author_click_month\": \"basic:author_click_month\",\n" + 87 | " \"basic:author_show_month\": \"basic:author_show_month\"\n" + 88 | " }"; 89 | 90 | ObjectMapper mapper = new ObjectMapper(); 91 | Map map = mapper.readValue(user, Map.class); 92 | 93 | String zookeeperQuorum = "singgel-53-3.inter.singgel.com,singgel-53-4.inter.singgel.com,singgel-53-5.inter.singgel.com,singgel-53-6.inter.singgel.com,singgel-54-3.inter.singgel.com,singgel-54-4.inter.singgel.com,singgel-54-5.inter.singgel.com,singgel-54-6.inter.singgel.com"; 94 | HbaseConfig hbaseConfig = new HbaseConfig(zookeeperQuorum, "2181", "/hbase-unsecure", 1, 0L, new HashMap<>()); 95 | 96 | HbaseUtil hbaseUtil = new HbaseUtil(hbaseConfig); 97 | String[] uids = {"3148682933", "3188053557", "2912663770", 
"3227054543", "1492133910", "1275730031"}; 98 | String[] statusIds = {"124616652","124650145","124458448","124628342","124386412","124382379","124303730","124479145","124580988","124331284"}; 99 | 100 | // long start1 = System.currentTimeMillis(); 101 | // for (String uid : uids) { 102 | // Get get = new Get(Bytes.toBytes(uid)); 103 | // map.keySet().forEach(e -> { 104 | // get.addFamily(Bytes.toBytes(e.split(":")[0])); 105 | // }); 106 | // hbaseUtil.singleGet("user_feature", get); 107 | // } 108 | // long start2 = System.currentTimeMillis(); 109 | // for (String uid : uids) { 110 | // Get get = new Get(Bytes.toBytes(uid)); 111 | // map.keySet().forEach(e -> { 112 | // get.addColumn(Bytes.toBytes(e.split(":")[0]), Bytes.toBytes(e.split(":")[1])); 113 | // }); 114 | // hbaseUtil.singleGet("user_feature", get); 115 | // } 116 | long start3 = System.currentTimeMillis(); 117 | // for (String uid : uids) { 118 | // hbaseUtil.singleGet("user_feature", new Get(Bytes.toBytes(uid))); 119 | // } 120 | 121 | for (String sid : statusIds) { 122 | hbaseUtil.singleGet("status_feature_string", new Get(Bytes.toBytes(sid))); 123 | } 124 | 125 | long start4 = System.currentTimeMillis(); 126 | 127 | // System.out.println(String.format("get with given families cost: %d ms", start2 - start1)); 128 | // System.out.println(String.format("get with given columns cost: %d ms", start3 - start2)); 129 | System.out.println(String.format("get with no given columns cost: %d ms", start4 - start3)); 130 | 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /src/test/java/com/singgel/bigdata/recommend/JobConfigTest.java: -------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.recommend; 2 | 3 | import com.singgel.bigdata.flinksinkhbase.common.HbaseUtil; 4 | import com.singgel.bigdata.flinksinkhbase.common.JoinTable; 5 | import com.singgel.bigdata.flinksinkhbase.common.ValueFormat; 6 | import com.singgel.bigdata.flinksinkhbase.config.HbaseConfig; 7 | import com.singgel.bigdata.flinksinkhbase.config.JobConfig; 8 | import com.singgel.bigdata.flinksinkhbase.config.KafkaConfig; 9 | import junit.framework.Assert; 10 | import junit.framework.TestCase; 11 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.ObjectMapper; 12 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.JsonNodeFactory; 13 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.ObjectNode; 14 | import org.junit.Test; 15 | 16 | import java.io.IOException; 17 | import java.util.ArrayList; 18 | import java.util.HashMap; 19 | import java.util.List; 20 | import java.util.Map; 21 | 22 | /** 23 | * \* @author singgel 24 | * \* @created_at: 2019/3/25 上午11:40 25 | * \ 26 | */ 27 | public class JobConfigTest extends TestCase { 28 | 29 | @Test 30 | public void testJobConfig() throws IOException { 31 | 32 | ObjectMapper mapper = new ObjectMapper(); 33 | 34 | String input = 
"3521088992|1553845345473_1420|16|USER_NEW_STATUS|3|124029277|1|33|3|25|2|0|0|0.0|0.0|200.0|0.0|0.0|0.0|-1|0.0||223.104.4.109|移动|中国||0.0|timeline_status_tf_rerank_v1|1049|0.0|5747931945392156544|0.3354492783546448||4487|19264|0|0|0|0|3|#|EWC=3-GOOGL=1-RUSS=1-JEQ=1-ENZL=1-BA=1|n,n,0,0,359.0,334.0||330.61522521972654|47=20-42=19-55=14-24=5-58=5-62=4-28=2-64=2-66=1-57=1-25=1-48=1-61=1#1109218013=3-8806901933=3-9699237903=2-9383578508=2-7807472003=2-9725962662=2-1978838521=2-1558983019=2-9139068670=2-3826938159=2-5124430882=2-1077346177=1-1683324140=1-1383227981=1-3216693756=1-9688940470=1-1697559028=1-1703783922=1-6028781397=1-1250983240=1#=4-SZ002430=2-EWC=1-HUAW=1-EWY=1-SZ300436=1-BA=1#94_3=10-54_3=6-319_3=5-80_3=5-95_3=4-182_3=4-12_3=3-343_3=2-308_3=2-218_3=2-64_3=2-337_3=2-224_3=2-260_3=2-139_3=1-116_3=1-137_3=1-76_3=1-157_3=1-78_3=1#中国=3-飞机=2-成本=2-总理=2-合作=2-市场=2-技>术=2-季报=2-板块=2-状态=1-工业股=1-中美=1-金叉=1-牛市=1-妈妈=1-谈判=1-爸爸=1-新西兰=1-上证指数=1-工作=1|高手|"; 35 | 36 | Map columnsConfig = new HashMap<>(); 37 | columnsConfig.put("0", "basic:time"); 38 | columnsConfig.put("1", "basic:userId"); 39 | columnsConfig.put("2", "basic:sessionId"); 40 | columnsConfig.put("3", "basic:testMissionId"); 41 | columnsConfig.put("4", "basic:strategeName"); 42 | columnsConfig.put("5", "basic:statusType"); 43 | columnsConfig.put("6", "basic:statusId"); 44 | columnsConfig.put("7", "basic:position"); 45 | columnsConfig.put("8", "basic:likeCount"); 46 | columnsConfig.put("9", "basic:retweetCount"); 47 | columnsConfig.put("10", "basic:replyCount"); 48 | columnsConfig.put("20", "basic:tag"); 49 | columnsConfig.put("22", "basic:stockSymbol"); 50 | columnsConfig.put("31", "basic:randomId"); 51 | columnsConfig.put("33", "basic:quoteString"); 52 | columnsConfig.put("41", "basic:contextInfo"); 53 | 54 | 55 | Map userFeatureColumnMapping = new HashMap<>(); 56 | userFeatureColumnMapping.put("basic:pagerank", "basic:pagerank"); 57 | userFeatureColumnMapping.put("basic:country", "basic:country"); 58 | userFeatureColumnMapping.put("basic:province", "basic:province"); 59 | userFeatureColumnMapping.put("basic:city", "basic:city"); 60 | userFeatureColumnMapping.put("basic:mobile", "basic:mobile"); 61 | userFeatureColumnMapping.put("basic:follow_cluster", "basic:follow_cluster"); 62 | userFeatureColumnMapping.put("basic:quality_cluster", "basic:quality_cluster"); 63 | userFeatureColumnMapping.put("basic:symbol_cluster", "basic:symbol_cluster"); 64 | 65 | JoinTable userFeature = new JoinTable("user_feature", "basic:userId", userFeatureColumnMapping); 66 | 67 | Map statusFeatureColumnMapping = new HashMap<>(); 68 | 69 | statusFeatureColumnMapping.put("basic:user_id", "basic:user_id"); 70 | statusFeatureColumnMapping.put("basic:symbol_id", "basic:symbol_id"); 71 | statusFeatureColumnMapping.put("basic:created_at", "basic:created_at"); 72 | statusFeatureColumnMapping.put("basic:source", "basic:source"); 73 | statusFeatureColumnMapping.put("basic:retweet_status_id", "basic:retweet_status_id"); 74 | statusFeatureColumnMapping.put("basic:paid_mention_user_id", "basic:paid_mention_user_id"); 75 | statusFeatureColumnMapping.put("basic:retweet_user_id", "basic:retweet_user_id"); 76 | statusFeatureColumnMapping.put("basic:retweet_symbol_id", "basic:retweet_symbol_id"); 77 | statusFeatureColumnMapping.put("basic:truncated", "basic:truncated"); 78 | statusFeatureColumnMapping.put("basic:flags", "basic:flags"); 79 | statusFeatureColumnMapping.put("basic:expired_at", "basic:expired_at"); 80 | statusFeatureColumnMapping.put("basic:title_length", 
"basic:title_length"); 81 | statusFeatureColumnMapping.put("basic:title_hash", "basic:title_hash"); 82 | 83 | JoinTable statusFeature = new JoinTable("status_feature_string", "basic:statusId", statusFeatureColumnMapping); 84 | 85 | List joinTables = new ArrayList<>(); 86 | joinTables.add(userFeature); 87 | joinTables.add(statusFeature); 88 | 89 | String bootstrtapServers = "localhost:9092"; 90 | String topic = "recommend2.statistics"; 91 | String groupId = "flink_recommend2_statistic_test"; 92 | ValueFormat valueFormat = ValueFormat.CSV; 93 | String delimiter = "|"; 94 | KafkaConfig kafkaConfig = new KafkaConfig(bootstrtapServers, topic, groupId, valueFormat, delimiter, new HashMap<>()); 95 | String zookeeperQuorum = "singgel-53-3.inter.singgel.com,singgel-53-4.inter.singgel.com,singgel-53-5.inter.singgel.com,singgel-53-6.inter.singgel.com,singgel-54-3.inter.singgel.com,singgel-54-4.inter.singgel.com,singgel-54-5.inter.singgel.com,singgel-54-6.inter.singgel.com"; 96 | HbaseConfig hbaseConfig = new HbaseConfig(zookeeperQuorum, "2181", "/hbase-unsecure", 1, 0L, new HashMap<>()); 97 | 98 | String rowKeyDelimiter = "#"; 99 | List rowKeyColumns = new ArrayList<>(); 100 | rowKeyColumns.add("basic:userId"); 101 | rowKeyColumns.add("basic:statusId"); 102 | String tableName = "test"; 103 | String jobName = "recommend_feature_hbase"; 104 | JobConfig jobConfig = new JobConfig(columnsConfig, rowKeyDelimiter, rowKeyColumns, tableName, kafkaConfig, hbaseConfig, joinTables, jobName, 2, ""); 105 | 106 | String jobConfigJosn = jobConfig.toString(); 107 | 108 | String expected = "{\"indexColumnMapping\":{\"22\":\"basic:stockSymbol\",\"33\":\"basic:quoteString\",\"0\":\"basic:time\",\"1\":\"basic:userId\",\"2\":\"basic:sessionId\",\"3\":\"basic:testMissionId\",\"4\":\"basic:strategeName\",\"5\":\"basic:statusType\",\"6\":\"basic:statusId\",\"7\":\"basic:position\",\"8\":\"basic:likeCount\",\"9\":\"basic:retweetCount\",\"41\":\"basic:contextInfo\",\"20\":\"basic:tag\",\"31\":\"basic:randomId\",\"10\":\"basic:replyCount\"},\"rowKeyDelimiter\":\"#\",\"rowKeyColumns\":[\"basic:userId\",\"basic:statusId\"],\"tableName\":\"test\",\"kafkaConfig\":{\"bootstrapServers\":\"localhost:9092\",\"topic\":\"recommend2.statistics\",\"groupId\":\"flink_recommend2_statistic_test\",\"valueFormat\":\"CSV\",\"delimiter\":\"|\",\"optionalProps\":{}},\"hbaseConfig\":{\"zookerperQuorum\":\"singgel-53-3.inter.singgel.com,singgel-53-4.inter.singgel.com,singgel-53-5.inter.singgel.com,singgel-53-6.inter.singgel.com,singgel-54-3.inter.singgel.com,singgel-54-4.inter.singgel.com,singgel-54-5.inter.singgel.com,singgel-54-6.inter.singgel.com\",\"port\":\"2181\",\"zookeeperZondeParent\":\"/hbase-unsecure\",\"optionalProp\":{},\"batchCount\":1,\"interval\":0},\"jobName\":\"recommend_feature_hbase\",\"parallelism\":2,\"jarName\":\"\",\"joinTables\":[{\"tableName\":\"user_feature\",\"joinKey\":\"basic:userId\",\"columnsMapping\":{\"basic:pagerank\":\"basic:pagerank\",\"basic:city\":\"basic:city\",\"basic:symbol_cluster\":\"basic:symbol_cluster\",\"basic:country\":\"basic:country\",\"basic:follow_cluster\":\"basic:follow_cluster\",\"basic:quality_cluster\":\"basic:quality_cluster\",\"basic:mobile\":\"basic:mobile\",\"basic:province\":\"basic:province\"}},{\"tableName\":\"status_feature_string\",\"joinKey\":\"basic:statusId\",\"columnsMapping\":{\"basic:source\":\"basic:source\",\"basic:title_hash\":\"basic:title_hash\",\"basic:symbol_id\":\"basic:symbol_id\",\"basic:retweet_user_id\":\"basic:retweet_user_id\",\"basic:user_id\":\"basic:user_id\",\"
basic:title_length\":\"basic:title_length\",\"basic:retweet_status_id\":\"basic:retweet_status_id\",\"basic:flags\":\"basic:flags\",\"basic:paid_mention_user_id\":\"basic:paid_mention_user_id\",\"basic:created_at\":\"basic:created_at\",\"basic:retweet_symbol_id\":\"basic:retweet_symbol_id\",\"basic:expired_at\":\"basic:expired_at\",\"basic:truncated\":\"basic:truncated\"}}]}"; 109 | 110 | 111 | Assert.assertEquals(expected, jobConfigJosn); 112 | 113 | JobConfig reJobConfig = mapper.readValue(jobConfig.toString(), JobConfig.class); 114 | reJobConfig.validate(); 115 | HbaseUtil hbaseUtil = new HbaseUtil(reJobConfig.getHbaseConfig()); 116 | hbaseUtil.prepareTable(reJobConfig.getTableName(), reJobConfig.families()); 117 | 118 | Assert.assertEquals("#", reJobConfig.getRowKeyDelimiter()); 119 | Assert.assertEquals("[basic]", reJobConfig.families().toString()); 120 | 121 | ObjectNode node = new ObjectNode(JsonNodeFactory.instance); 122 | node.put("key", "2019-03-27 09:23:00"); 123 | node.put("value", input); 124 | 125 | } 126 | 127 | } 128 | -------------------------------------------------------------------------------- /src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{50} - %msg%n 10 | 11 | 12 | 13 | 14 | 15 | 16 | ${LOG_HOME}/TestWeb.log.%d{yyyy-MM-dd}.log 17 | 18 | 30 19 | 20 | 21 | 22 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{50} - %msg%n 23 | 24 | 25 | 26 | 10MB 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | --------------------------------------------------------------------------------