├── README.md ├── pom.xml └── src ├── main ├── java │ └── com │ │ └── singgel │ │ └── bigdata │ │ └── flinksinkhbase │ │ ├── FlinkRunner.java │ │ ├── HbaseSink.java │ │ ├── common │ │ ├── CustomJsonDeserializationSchema.java │ │ ├── HbaseUtil.java │ │ ├── JobConfigManager.java │ │ ├── JoinTable.java │ │ └── ValueFormat.java │ │ └── config │ │ ├── HbaseConfig.java │ │ ├── JobConfig.java │ │ └── KafkaConfig.java └── resources │ ├── application.conf │ └── logback.xml └── test ├── java └── com │ └── singgel │ └── bigdata │ └── recommend │ ├── HbaseByteTest.java │ ├── HbaseGetTest.java │ └── JobConfigTest.java └── resources └── logback.xml /README.md: -------------------------------------------------------------------------------- 1 | # flink-kafka-hbase 2 | 功能:实现kafka消息实时落地hbase,支持csv/json字符串两种格式的消息,支持自定义组合rowkey,列簇和列名,支持按照kafka消息流中不同字段join不同的hbase表,并自定义写入列簇和列(join时需评估一下性能) 3 | 支持at least once语义 4 | 外部依赖:apollo配置中心,本项目依靠配置驱动,配置存储在apollo配置中心 5 | 配置: 6 | ``` 7 | { 8 | "indexColumnMapping": { --indexColumnMapping即CSV格式消息的key和value按照value里的分隔符拼接后再分割后下标及写入hbase列的对应关系 9 | "0": "basic:time", --第0列始终是kafka消息的key,如果不需要可以不指定 10 | "1": "basic:user_id", 11 | "2": "basic:session_id", 12 | "3": "basic:test_mission_id", 13 | "4": "basic:stratege_name", 14 | "5": "basic:status_type", 15 | "6": "basic:status_id", 16 | "7": "basic:position", 17 | "8": "basic:like_count", 18 | "9": "basic:retweet_count", 19 | "10": "basic:reply_count", 20 | "11": "basic:fav_count", 21 | "12": "basic:reward_amount", 22 | "13": "basic:reward_user_count", 23 | "14": "basic:status_hot_score", 24 | "15": "basic:status_hot_score_norm", 25 | "16": "basic:user_score", 26 | "17": "basic:use_score_norm", 27 | "18": "basic:stock_score", 28 | "19": "basic:stock_score_norm", 29 | "20": "basic:tag", 30 | "21": "basic:tag_score", 31 | "22": "basic:stock_symbol", 32 | "23": "basic:ip", 33 | "24": "basic:device", 34 | "25": "basic:country_name", 35 | "26": "basic:city_name", 36 | "27": "basic:topic_score", 37 | "28": "basic:rerank_name", 38 | "29": "basic:author_block_count", 39 | "30": "basic:percent", 40 | "31": "basic:random_id", 41 | "32": "basic:rank_score", 42 | "33": "basic:quote_string", 43 | "34": "basic:click_num", 44 | "35": "basic:show_num", 45 | "36": "basic:tag_short_term_click", 46 | "37": "basic:tag_short_term_show", 47 | "38": "basic:tag_long_term_click", 48 | "39": "basic:tag_long_term_show", 49 | "40": "basic:block_count", 50 | "41": "basic:context_info", 51 | "42": "basic:recent_behavior", 52 | "43": "basic:basic_string", 53 | "44": "basic:mention_stock_rank", 54 | "45": "basic:text_quality_score", 55 | "46": "basic:last_nc_context", 56 | "47": "basic:keywords" 57 | }, 58 | "rowKeyDelimiter": "#", --如果rowkey是多个列的拼接,则需指定的拼接符 59 | "rowKeyColumns": [ --rowkey组成的列 60 | "basic:user_id", 61 | "basic:statusId" 62 | ], 63 | "tableName": "cy_test", --数据流要写入hbase表的表名(如不存在会自动创建) 64 | "kafkaConfig": { --flink接入kafka数据源的配置 65 | "bootstrapServers": "singgel:9092", --kafka的broker list 66 | "topic": "recommend2.statistics", --需要接入的topic 67 | "groupId": "flink_recommend2_statistic_join_test2", --flink中消费kafka topic的groupId 68 | "delimiter": "|", --kafka消息value的分隔符,当valueFormat=CSV时必须指定 69 | "valueFormat": "CSV", --kafka消息value的格式,目前支持"CSV"和"JSON"两种 70 | "optionalProps": {} --其他kafka消费者配置 71 | }, 72 | "hbaseConfig": { --写入的hbase集群配置 73 | "zookerperQuorum": 
"singgel-53-3.inter.singgel.com,singgel-53-4.inter.singgel.com,singgel-53-5.inter.singgel.com,singgel-53-6.inter.singgel.com,singgel-54-3.inter.singgel.com,singgel-54-4.inter.singgel.com,singgel-54-5.inter.singgel.com,singgel-54-6.inter.singgel.com", 74 | "port": "2181", 75 | "zookeeperZondeParent": "/hbase-unsecure", --hbase在zookeeper中的根目录节点名称(注意:咱内部cdh集群是/hbase,此处是ambari集群hbase的配置) 76 | "batchCount": 100, --批量写入的条数,与interval条件满足其一就触发写入,注:当接入的topic数据源生产速率较小时且无join时,可以设置为1,逐条写入 77 | "interval": 5000, --批量写入的间隔时间 78 | "optionalProp": {} --其他hbase设置 79 | }, 80 | "jobName": "recommend_feature_hbase_sink_test", --flink job的名称 81 | "parallelism": 8, --flink任务执行时的平行度 82 | "jarName": "flink-kafka-hbase-1.0-SNAPSHOT-jar-with-dependencies.jar", --执行flink任务的jar包,当通过实时平台界面提交flinkjob时需要指定 83 | "joinTables": [ --需要join的表,可以指定多个,可以为空;当要join多个表时,需要评估一下性能 84 | { 85 | "tableName": "user_feature", --需要join的hbase表 86 | "joinKey": "basic:userId", --join的字段,需要在"indexColumnMapping的values中,且是joinTable的rowKey 87 | "columnsMapping": { --join表中的列和要写入表中的列的对应关系,key->fromFamily:fromColumn,value->toFamily:toColumn,from和to的列簇和列不需一致 88 | "basic:pagerank": "basic :pagerank", 89 | "basic:country": "basic:country", 90 | "basic:province": "basic:province", 91 | "basic:city": "basic:city", 92 | "basic:mobile": "basic:mobile", 93 | "basic:follower_cluster": "basic:follower_cluster", 94 | "basic:quality_cluster": "basic:quality_cluster", 95 | "basic:symbol_cluster": "basic:symbol_cluster", 96 | "basic:topic_cluster": "basic:topic_cluster", 97 | "basic:stock_click7": "basic:stock_click7", 98 | "basic:stock_show7": "basic:stock_show7", 99 | "basic:stock_click30": "basic:stock_click30", 100 | "basic:stock_show30": "basic:stock_show30", 101 | "basic:symbol_page_enter": "basic:symbol_page_enter", 102 | "basic:symbol_new_status": "basic:symbol_new_status", 103 | "basic:symbol_hot": "basic:symbol_hot", 104 | "basic:symbol_finance": "basic:symbol_finance", 105 | "basic:symbol_news": "basic:symbol_news", 106 | "basic:symbol_notice": "basic:symbol_notice", 107 | "basic:symbol_general": "basic:symbol_general", 108 | "basic:symbol_page_view": "basic:symbol_page_view", 109 | "basic:symbol_page_origin": "basic:symbol_page_origin", 110 | "basic:attention_mark": "basic:attention_mark", 111 | "basic:rebalance_num": "basic:rebalance_num", 112 | "basic:topic_personal_short_click": "basic:topic_personal_short_click", 113 | "basic:topic_personal_short_show": "basic:topic_personal_short_show", 114 | "basic:topic_personal_long_click": "basic:topic_personal_long_click", 115 | "basic:topic_personal_long_show": "basic:topic_personal_long_show", 116 | "basic:dislike_1st": "basic:dislike_1st", 117 | "basic:dislike_2st": "basic:dislike_2st", 118 | "basic:dislike_3st": "basic:dislike_3st", 119 | "basic:dislike_4st": "basic:dislike_4st", 120 | "basic:dislike_5st": "basic:dislike_5st", 121 | "basic:familar_1st": "basic:familar_1st", 122 | "basic:familar_2st": "basic:familar_2st", 123 | "basic:familar_3st": "basic:familar_3st", 124 | "basic:familar_4st": "basic:familar_4st", 125 | "basic:familar_5st": "basic:familar_5st", 126 | "basic:like_1st": "basic:like_1st", 127 | "basic:like_2st": "basic:like_2st", 128 | "basic:like_3st": "basic:like_3st", 129 | "basic:like_4st": "basic:like_4st", 130 | "basic:like_5st": "basic:like_5st", 131 | "basic:unfamilar_1st": "basic:unfamilar_1st", 132 | "basic:unfamilar_2st": "basic:unfamilar_2st", 133 | "basic:unfamilar_3st": "basic:unfamilar_3st", 134 | "basic:unfamilar_4st": "basic:unfamilar_4st", 135 | "basic:unfamilar_5st": 
"basic:unfamilar_5st", 136 | "basic:headline_down_cnt": "basic:headline_down_cnt", 137 | "basic:headline_up_cnt": "basic:headline_up_cnt", 138 | "basic:optional_cnt": "basic:optional_cnt", 139 | "basic:dynamic_cnt": "basic:dynamic_cnt", 140 | "basic:quotation_cnt": "basic:quotation_cnt", 141 | "basic:base_rate": "basic:base_rate", 142 | "basic:mark_gegu_enter": "basic:mark_gegu_enter", 143 | "basic:mark_share_sum": "basic:mark_share_sum", 144 | "basic:mark_head_dislike_sum": "basic:mark_head_dislike_sum", 145 | "basic:mark_status_post_user_sum": "basic:mark_status_post_user_sum", 146 | "basic:mark_search_sum": "basic:mark_search_sum", 147 | "basic:mark_debate_post_user_num": "basic:mark_debate_post_user_num", 148 | "basic:author_click_week": "basic:author_click_week", 149 | "basic:author_show_week": "basic:author_show_week", 150 | "basic:author_click_month": "basic:author_click_month", 151 | "basic:author_show_month": "basic:author_show_month" 152 | } 153 | }, 154 | { 155 | "tableName": "status_feature_string", 156 | "joinKey": "basic:statusId", 157 | "columnsMapping": { 158 | "basic:user_id": "basic:user_id", 159 | "basic:symbol_id": "basic:symbol_id", 160 | "basic:created_at": "basic:created_at", 161 | "basic:source": "basic:source", 162 | "basic:retweet_status_id": "basic:retweet_status_id", 163 | "basic:paid_mention_id": "basic:paid_mention_id", 164 | "basic:retweet_user_id": "basic:retweet_user_id", 165 | "basic:retweet_symbol_id": "basic:retweet_symbol_id", 166 | "basic:truncate": "basic:truncate", 167 | "basic:flags": "basic:flags", 168 | "basic:expired_at": "basic:expired_at", 169 | "basic:title_length": "basic:title_length", 170 | "basic:title_hash": "basic:title_hash", 171 | "basic:title_flag": "basic:title_flag", 172 | "basic:text_length": "basic:text_length", 173 | "basic:pic_count": "basic:pic_count", 174 | "basic:type": "basic:type", 175 | "basic:meta_classes": "basic:meta_classes", 176 | "basic:pic_score": "basic:pic_score", 177 | "basic:domain": "basic:domain", 178 | "basic:url_hash": "basic:url_hash", 179 | "basic:character_percent": "basic:character_percent", 180 | "basic:symbol": "basic:symbol", 181 | "basic:keyword": "basic:keyword", 182 | "basic:match_word": "basic:match_word", 183 | "basic:keyword_title": "basic:keyword_title", 184 | "basic:keyword_des": "basic:keyword_des", 185 | "basic:symbol_title": "basic:symbol_title", 186 | "basic:symbol_sim_title": "basic:symbol_sim_title", 187 | "basic:symbol_des": "basic:symbol_des", 188 | "basic:symbol_sim_des": "basic:symbol_sim_des", 189 | "basic:symbol_content": "basic:symbol_content", 190 | "basic:symbol_sim_content": "basic:symbol_sim_content" 191 | } 192 | } 193 | ] 194 | } 195 | ``` 196 | 197 | 在flink任务启动时,会去apollo配置中心取指定的配置,根据配置执行任务。 198 | 199 | 关键实现HbaseSink代码如下: 200 | ``` 201 | @Slf4j 202 | public class HbaseSink extends RichSinkFunction implements CheckpointedFunction { 203 | 204 | private final JobConfig jobConfig; 205 | 206 | private HbaseUtil hbaseUtil; 207 | 208 | private long currentTime = System.currentTimeMillis(); 209 | 210 | /** 211 | * 在flink任务自动重试时,会先恢复state中的数据;如果是cancel掉flink任务,重新手动提交,则state会清空 212 | */ 213 | private transient ListState checkpointedState; 214 | 215 | private List nodes = new ArrayList<>(); 216 | 217 | private static ObjectMapper MAPPER = new ObjectMapper(); 218 | 219 | private StringBuilder sbLog = new StringBuilder(); 220 | 221 | public HbaseSink(JobConfig jobConfig) { 222 | this.jobConfig = jobConfig; 223 | } 224 | 225 | 226 | @Override 227 | public void open(Configuration 
parameters) throws Exception { 228 | super.open(parameters); 229 | this.hbaseUtil = new HbaseUtil(jobConfig.getHbaseConfig()); 230 | } 231 | 232 | /** 233 | * 在手动cancel和程序内部出错重试时都会触发close方法,在close方法中将nodes中的数据先flush,防止在两次写入之间,checkpoint点之前的数据丢失 234 | * 但会有数据重复,checkpoint点之后到发生故障时的数据会重复,如下示意图: 235 | *
236 |      *                       flush
237 |      *                        ^
238 |      *                       /
239 |      *            +--------------------------+
240 |      * ------write------checkpoint-----------down----
241 |      *                           +-----------+
242 |      *                                 ^
243 |      *                                /
244 |      *                           will repeat
245 |      * 
246 | *
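     * The records repeat because Kafka offsets are committed only at checkpoints: close() flushes whatever
     * is still buffered in nodes, but after a restart the job rewinds to the last completed checkpoint and
     * re-consumes everything after it, so those records are written to HBase again (at-least-once).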
247 | * 但对于写入具有幂等性的业务,数据重复写入不会影响结果 248 | * 249 | * @throws Exception 250 | */ 251 | @Override 252 | public void close() throws Exception { 253 | log.debug("execute sink close method"); 254 | super.close(); 255 | batchFlush(); 256 | if (this.hbaseUtil.getConnection() != null) { 257 | try { 258 | this.hbaseUtil.getConnection().close(); 259 | } catch (Exception e) { 260 | log.warn("connection close failed. error:{} ", e.getMessage()); 261 | } 262 | } 263 | } 264 | 265 | /** 266 | * 每条记录调用一次此方法 267 | * 268 | * @param node 269 | * @param context 270 | * @throws Exception 271 | */ 272 | @Override 273 | public void invoke(ObjectNode node, Context context) throws Exception { 274 | String partition = node.get("metadata").get("partition").asText(); 275 | String offset = node.get("metadata").get("offset").asText(); 276 | String value = node.get("value").asText(); 277 | log.debug("partition->{}|offset->{}|value->{}", partition, offset, value); 278 | nodes.add(node); 279 | if (nodes.size() >= jobConfig.getHbaseConfig().getBatchCount() || 280 | (System.currentTimeMillis() - currentTime > jobConfig.getHbaseConfig().getInterval() && nodes.size() > 0)) { 281 | batchFlush(); 282 | } 283 | } 284 | 285 | /** 286 | * 将{@link HbaseSink#nodes}中的数据批量写入 287 | * 288 | * @throws IOException 289 | */ 290 | private void batchFlush() throws IOException { 291 | long start = System.currentTimeMillis(); 292 | List puts = convertBatch(nodes); 293 | if (puts.size() == 0) { 294 | return; 295 | } 296 | long startPut = System.currentTimeMillis(); 297 | hbaseUtil.putBatchData(jobConfig.getTableName(), puts); 298 | long end = System.currentTimeMillis(); 299 | sbLog.append(String.format(" | batch_put(%d) cost %d ms", puts.size(), end - startPut)); 300 | sbLog.append(String.format(" | batch_total(%d) cost %d ms", puts.size(), end - start)); 301 | sbLog.append(String.format(" | per record cost %d ms", (end - start) / puts.size())); 302 | log.debug(sbLog.toString()); 303 | currentTime = System.currentTimeMillis(); 304 | sbLog = new StringBuilder(); 305 | nodes.clear(); 306 | } 307 | 308 | /** 309 | * 批量处理 310 | * 311 | * @param objectNodes 一批数据 312 | * @return 返回批量的Put 313 | */ 314 | private List convertBatch(List objectNodes) throws IOException { 315 | Map> puts = new HashMap<>(objectNodes.size()); 316 | //存储每个需要join的表中这个批次的rowkey的值 317 | Map> joinKeys = new HashMap<>(objectNodes.size()); 318 | 319 | for (ObjectNode node : objectNodes) { 320 | Map keyValues = getKeyValues(node); 321 | 322 | //获取拼接的rowKey 323 | List rowKeyValues = new ArrayList<>(); 324 | jobConfig.getRowKeyColumns().forEach(e -> rowKeyValues.add(keyValues.get(e))); 325 | if (rowKeyValues.stream().anyMatch(Objects::isNull)) { 326 | //如果组合rowKey的字段中有null,则过滤掉此记录 327 | log.warn("columns which consist of rowKey has null value"); 328 | continue; 329 | } 330 | String rowKey = String.join(jobConfig.getRowKeyDelimiter(), rowKeyValues); 331 | Put put = new Put(Bytes.toBytes(rowKey)); 332 | 333 | //获取这个joinTable表中这个批次所有需要join的key 334 | for (JoinTable joinTable : jobConfig.getJoinTables()) { 335 | 336 | joinKeys.compute(joinTable.getTableName(), (k, v) -> { 337 | //keyValues.get(joinTable.getJoinKey()的值有可能为null,需做空判断 338 | if (keyValues.get(joinTable.getJoinKey()) != null) { 339 | if (v == null) { 340 | v = new HashSet<>(); 341 | v.add(keyValues.get(joinTable.getJoinKey())); 342 | } else { 343 | v.add(keyValues.get(joinTable.getJoinKey())); 344 | } 345 | } 346 | return v; 347 | }); 348 | } 349 | 350 | 351 | //原始topic需要写入的列 352 | keyValues.forEach((k, v) -> { 353 | String 
family = k.split(":")[0]; 354 | String column = k.split(":")[1]; 355 | put.addColumn(Bytes.toBytes(family), Bytes.toBytes(column), v != null ? Bytes.toBytes(v) : null); 356 | }); 357 | 358 | puts.put(put, keyValues); 359 | } 360 | 361 | //当需要join时执行下面操作 362 | for (JoinTable joinTable : jobConfig.getJoinTables()) { 363 | //取出这个joinTable表中这个批次所有需要join的key 364 | Set keys = joinKeys.get(joinTable.getTableName()); 365 | List gets = new ArrayList<>(); 366 | //将key和result一一对应 367 | Map keyResults = new HashMap<>(keys.size()); 368 | 369 | //生成需要批量get的List 370 | keys.forEach(e -> { 371 | Get get = new Get(Bytes.toBytes(e)); 372 | joinTable.getColumnsMapping().forEach((k, v) -> { 373 | get.addColumn(Bytes.toBytes(k.split(":")[0]), Bytes.toBytes(k.split(":")[1])); 374 | }); 375 | gets.add(get); 376 | }); 377 | 378 | 379 | long start = System.currentTimeMillis(); 380 | //执行批量get 381 | Result[] results = hbaseUtil.batchGet(joinTable.getTableName(), gets); 382 | for (Result result : results) { 383 | if (result != null) { 384 | keyResults.put(Bytes.toString(result.getRow()), result); 385 | } 386 | } 387 | long end = System.currentTimeMillis(); 388 | 389 | sbLog.append(String.format("| batch_get %s(%d) %d ms", joinTable.getTableName(), keys.size(), (end - start))); 390 | //对之前原始写入的每个put,获取这个表需要join的rowKey的result,然后将result中的值根据joinTable的配置添加到put的对应列中 391 | puts.forEach((put, keyValues) -> { 392 | Result result = keyResults.get(keyValues.get(joinTable.getJoinKey())); 393 | if (result != null) { 394 | joinTable.getColumnsMapping().forEach((k, v) -> { 395 | byte[] columnValue = result.getValue(Bytes.toBytes(k.split(":")[0]), Bytes.toBytes(k.split(":")[1])); 396 | put.addColumn(Bytes.toBytes(v.split(":")[0]), Bytes.toBytes(v.split(":")[1]), columnValue); 397 | }); 398 | } 399 | }); 400 | } 401 | 402 | 403 | return new ArrayList<>(puts.keySet()); 404 | } 405 | 406 | /** 407 | * 根据配置中给定的列值对应关系,将每条消息解析成格式,key为配置中指定的列名(包含列簇) 408 | * 目前支持两种消息格式:CSV和JSON格式的字符串型数据, 409 | * 在处理消息时,kafka消息的key默认会被集成到value中, 对于CSV格式,kafka消息的key处在index=0的位置;对于JSON格式,kafka消息的key对应默认的kafka_key字段 410 | * 411 | * @param node flink接入的kafka消息 412 | * @return 返回字段名称对应的值 413 | */ 414 | private Map getKeyValues(ObjectNode node) { 415 | Map indexColumns = jobConfig.getIndexColumnMapping(); 416 | String key = node.get("key") == null ? "" : node.get("key").asText(); 417 | String value = node.get("value") == null ? 
"" : node.get("value").asText(); 418 | 419 | 420 | Map keyValues = new HashMap<>(8); 421 | 422 | ValueFormat valueFormat = jobConfig.getKafkaConfig().getValueFormat(); 423 | switch (valueFormat) { 424 | case CSV: 425 | //将key和value拼接起来,配置时kafka的key值作为下标的第0个 426 | String input = key + jobConfig.getKafkaConfig().getDelimiter() + value; 427 | String[] columnValues = StringUtils.splitPreserveAllTokens(input, jobConfig.getKafkaConfig().getDelimiter()); 428 | 429 | //将index对应的列值写入对应的列名下,列名包含了列簇名,形如:family:qualifier 430 | for (Map.Entry entry : indexColumns.entrySet()) { 431 | try { 432 | keyValues.put(entry.getValue(), columnValues[Integer.valueOf(entry.getKey())]); 433 | } catch (Exception e) { 434 | 435 | log.warn("index {} out of boundary.", entry.getKey(), e); 436 | } 437 | } 438 | 439 | break; 440 | case JSON: 441 | default: 442 | //将kafka的key加入node,统一处理 443 | try { 444 | 445 | ObjectNode jsonNode = (ObjectNode) MAPPER.readTree(value); 446 | 447 | jsonNode.put("kafka_key", key); 448 | 449 | //将配置中指定的列值写入对应的列名下,列名包含了列簇名,形如:family:qualifier 450 | indexColumns.forEach((k, v) -> { 451 | if (jsonNode.get(k) != null) { 452 | keyValues.put(v, jsonNode.get(k).asText()); 453 | } 454 | }); 455 | } catch (IOException e) { 456 | keyValues.clear(); 457 | indexColumns.forEach((k, v) -> keyValues.put(v, null)); 458 | String partition = node.get("metadata").get("partition").asText(); 459 | String offset = node.get("metadata").get("offset").asText(); 460 | String topic = node.get("metadata").get("topic").asText(); 461 | log.warn("this json record failed.topic->{},partition->{},offset->{},value->{}", topic, partition, offset, value); 462 | } 463 | break; 464 | } 465 | return keyValues; 466 | } 467 | 468 | /** 469 | * 执行频率和{@link FlinkRunner}中指定的checkpoint间隔一致 470 | * 471 | * @param context 472 | * @throws Exception 473 | */ 474 | @Override 475 | public void snapshotState(FunctionSnapshotContext context) throws Exception { 476 | checkpointedState.clear(); 477 | for (ObjectNode element : nodes) { 478 | checkpointedState.add(element); 479 | } 480 | log.debug("execute snapshot at {}", System.currentTimeMillis()); 481 | } 482 | 483 | /** 484 | * 在程序内部出错重启时,如果调用了snapshotState方法,则会恢复checkpointedState中的数据,如果是手动cancel或重试几次失败后重新提交任务,此时 485 | * 的checkpointedState会是新的对象,里面没有数据 486 | * 487 | * @param context 488 | * @throws Exception 489 | */ 490 | @Override 491 | public void initializeState(FunctionInitializationContext context) throws Exception { 492 | ListStateDescriptor descriptor = 493 | new ListStateDescriptor<>( 494 | "hbase-sink-cp", 495 | TypeInformation.of(new TypeHint() { 496 | })); 497 | 498 | checkpointedState = context.getOperatorStateStore().getListState(descriptor); 499 | 500 | if (context.isRestored()) { 501 | for (ObjectNode element : checkpointedState.get()) { 502 | nodes.add(element); 503 | } 504 | } 505 | log.info("initialState {} record", nodes.size()); 506 | } 507 | } 508 | ``` 509 | 510 | 开发过程中遇到的问题主要有两点,一是处理速度,二是批量处理时出错数据丢失的问题。 511 | 512 | 对于处理速度的优化: 513 | 514 | (1)由最开始的单条写入改成批量写入,但在获取joinTable的列时依然是逐条获取,每个rowkey调用一次get方法,比较费时 515 | 516 | (2)将joinTable的逐条get,改成批量get,速度提升了4-5倍,一是因为减少了提交请求的次数,加快返回速度;二是因为短时间内recommend2.statistics记录user_id和status_id分别都有重复,批量时可以减少实际查询rowkey的个数进而节省时间。 517 | 518 | (3)尽管做了以上两点优化,但速度还是很慢,经过打日志发现,主要是user_feature这个表获取rowkey的值太慢,于是又在发送get请求时做了如下优化: 519 | 520 | 因为user_feature的所有列都插入到集成的表中,一开始就没有在get请求时指定要获取的列簇和列名,优化就是在提交get请求时,指定所有需要获取的列簇和列名,这样明显快很多,大概提升10倍,此时写入一条join后的数据大概耗时2-3ms 521 | | batch_get user_feature(139) 134 ms| batch_get status_feature_string(160) 59 ms | 
batch_put(200) cost 195 ms | batch_total(200) cost 433 ms | per record cost 2 ms 522 | | batch_get user_feature(134) 132 ms| batch_get status_feature_string(169) 56 ms | batch_put(200) cost 201 ms | batch_total(200) cost 434 ms | per record cost 2 ms 523 | 524 | 对于处理出错数据丢失的问题: 525 | 526 | 数据丢失的场景:(1)当HbaseSink中调用了多次invoke方法,nodes中累积了一定的数量,但还没有触发写入操作,此时flink程序由于某种原因失败了自动重启,之前nodes中累积的记录就会丢失。 527 | 528 | 怎样做到数据不丢失? 529 | 530 | (1)让HbaseSink实现CheckpointedFunction接口,实现snapshotState和initializeState方法,snapshotState的调用频率和FlinkRunner中指定的checkpoint的频率一致,每次checkpoint会提交kafka的offset,并执行snapshotState方法,在snapshotState方法中,会将nodes中的元素加入到checkpointState中,当flink程序失败自动重启后,initializeState方法会从checkpointState中恢复nodes中的数据,接着处理。 531 | 532 | (2)当flink任务重试几次失败导致任务最终失败或者手动停止flink任务,再重新提交flink任务时,checkpointedState会是新的对象,不会保存上次任务失败或停止时nodes中的数据,这种情况依然会丢数据,因为程序失败自动重启和手动停止时都会调用close方法,因此在close方法中调用batchFlush方法,先写入再关闭。但重新启动时,从上次checkpoint到停止时的消息会重复处理。 533 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.singgel.bigdata 8 | flink-kafka-hbase 9 | 1.0 10 | 11 | 12 | 1.7.0 13 | 14 | 15 | 16 | 17 | 18 | org.apache.flink 19 | flink-streaming-scala_2.11 20 | ${flink.version} 21 | 22 | 23 | 24 | org.apache.flink 25 | flink-table_2.11 26 | ${flink.version} 27 | 28 | 29 | org.apache.flink 30 | flink-json 31 | 1.7.0 32 | 33 | 34 | 35 | org.apache.flink 36 | flink-connector-kafka_2.11 37 | ${flink.version} 38 | 39 | 40 | 41 | com.fasterxml.jackson.core 42 | jackson-databind 43 | 2.13.4.2 44 | 45 | 46 | 47 | org.apache.flink 48 | flink-hbase_2.11 49 | ${flink.version} 50 | 51 | 52 | 53 | org.apache.hadoop 54 | hadoop-common 55 | 3.2.4 56 | 57 | 58 | 59 | com.squareup.okhttp3 60 | okhttp 61 | 3.12.1 62 | 63 | 64 | 65 | junit 66 | junit 67 | 4.13.1 68 | test 69 | 70 | 71 | 72 | com.typesafe 73 | config 74 | 1.3.1 75 | 76 | 77 | 78 | org.projectlombok 79 | lombok 80 | 1.18.4 81 | provided 82 | 83 | 84 | 85 | 86 | org.slf4j 87 | slf4j-api 88 | 1.7.25 89 | 90 | 91 | 92 | ch.qos.logback 93 | logback-classic 94 | 1.2.3 95 | 96 | 97 | 98 | org.slf4j 99 | jcl-over-slf4j 100 | 1.7.25 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | ${artifactId}-${version} 110 | 111 | 112 | org.apache.maven.plugins 113 | maven-compiler-plugin 114 | 115 | 8 116 | 8 117 | 118 | 119 | 120 | org.apache.maven.plugins 121 | maven-assembly-plugin 122 | 2.4.1 123 | 124 | 125 | 126 | jar-with-dependencies 127 | 128 | 129 | 130 | 131 | com.singgel.bigdata.flinksinkhbase.FlinkRunner 132 | 133 | 134 | 135 | 136 | 137 | make-assembly 138 | 139 | package 140 | 141 | single 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | -------------------------------------------------------------------------------- /src/main/java/com/singgel/bigdata/flinksinkhbase/FlinkRunner.java: -------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.flinksinkhbase; 2 | 3 | import com.singgel.bigdata.flinksinkhbase.common.CustomJsonDeserializationSchema; 4 | import com.singgel.bigdata.flinksinkhbase.common.HbaseUtil; 5 | import com.singgel.bigdata.flinksinkhbase.common.JobConfigManager; 6 | import com.singgel.bigdata.flinksinkhbase.config.JobConfig; 7 | import lombok.extern.slf4j.Slf4j; 8 | import org.apache.flink.api.common.restartstrategy.RestartStrategies; 9 | import org.apache.flink.api.common.time.Time; 10 | import org.apache.flink.api.java.utils.ParameterTool; 11 | import 
org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.ObjectNode; 12 | import org.apache.flink.streaming.api.CheckpointingMode; 13 | import org.apache.flink.streaming.api.datastream.DataStreamSource; 14 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 15 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer; 16 | import org.apache.hadoop.hbase.client.Put; 17 | 18 | import java.util.*; 19 | import java.util.concurrent.TimeUnit; 20 | 21 | /** 22 | * \* @author singgel 23 | * \* @created_at: 2019/3/24 下午1:49 24 | * \ 25 | */ 26 | @Slf4j 27 | public class FlinkRunner { 28 | 29 | public static void main(String[] args) throws Exception { 30 | 31 | String jobKey; 32 | try { 33 | final ParameterTool params = ParameterTool.fromArgs(args); 34 | jobKey = params.get("jobKey"); 35 | } catch (Exception e) { 36 | System.err.println("No jobKey specified. Please run 'FlinkRunner --jobKey '"); 37 | return; 38 | } 39 | 40 | JobConfig jobConfig = JobConfigManager.getConfigByKey(jobKey); 41 | jobConfig.validate(); 42 | 43 | HbaseUtil hbaseUtil = new HbaseUtil(jobConfig.getHbaseConfig()); 44 | hbaseUtil.prepareTable(jobConfig.getTableName(), jobConfig.families()); 45 | 46 | final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 47 | 48 | env.getConfig().registerKryoType(JobConfig.class); 49 | env.getConfig().registerKryoType(Put.class); 50 | env.getConfig().registerKryoType(HbaseUtil.class); 51 | 52 | Properties prop = jobConfig.getKafkaConfig().kafkaProps(); 53 | DataStreamSource dataStreamSource = env.addSource(new FlinkKafkaConsumer<>(jobConfig.getKafkaConfig().getTopic(), 54 | new CustomJsonDeserializationSchema(true), prop)); 55 | 56 | dataStreamSource.addSink(new HbaseSink(jobConfig)); 57 | 58 | //设置最大失败重启尝试次数及每次重启间隔时间 59 | env.setRestartStrategy(RestartStrategies.fixedDelayRestart( 60 | 3, 61 | Time.of(10L, TimeUnit.SECONDS) 62 | )); 63 | 64 | env.enableCheckpointing(JobConfig.CHECKPOINT_INTERVAR, CheckpointingMode.EXACTLY_ONCE); 65 | 66 | env.execute(jobConfig.getJobName()); 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /src/main/java/com/singgel/bigdata/flinksinkhbase/HbaseSink.java: -------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.flinksinkhbase; 2 | 3 | import com.singgel.bigdata.flinksinkhbase.common.HbaseUtil; 4 | import com.singgel.bigdata.flinksinkhbase.common.JoinTable; 5 | import com.singgel.bigdata.flinksinkhbase.common.ValueFormat; 6 | import com.singgel.bigdata.flinksinkhbase.config.JobConfig; 7 | import lombok.extern.slf4j.Slf4j; 8 | import org.apache.commons.lang3.StringUtils; 9 | import org.apache.flink.api.common.state.ListState; 10 | import org.apache.flink.api.common.state.ListStateDescriptor; 11 | import org.apache.flink.api.common.typeinfo.TypeHint; 12 | import org.apache.flink.api.common.typeinfo.TypeInformation; 13 | import org.apache.flink.configuration.Configuration; 14 | import org.apache.flink.runtime.state.FunctionInitializationContext; 15 | import org.apache.flink.runtime.state.FunctionSnapshotContext; 16 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.ObjectMapper; 17 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.ObjectNode; 18 | import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction; 19 | import org.apache.flink.streaming.api.functions.sink.RichSinkFunction; 20 | import 
org.apache.hadoop.hbase.client.Get; 21 | import org.apache.hadoop.hbase.client.Put; 22 | import org.apache.hadoop.hbase.client.Result; 23 | import org.apache.hadoop.hbase.util.Bytes; 24 | 25 | import java.io.IOException; 26 | import java.util.*; 27 | 28 | /** 29 | * \* @author singgel 30 | * \* @created_at: 2019/3/31 下午12:29 31 | * \ 32 | */ 33 | 34 | @Slf4j 35 | public class HbaseSink extends RichSinkFunction implements CheckpointedFunction { 36 | 37 | private final JobConfig jobConfig; 38 | 39 | private HbaseUtil hbaseUtil; 40 | 41 | private long currentTime = System.currentTimeMillis(); 42 | 43 | /** 44 | * 在flink任务自动重试时,会先恢复state中的数据;如果是cancel掉flink任务,重新手动提交,则state会清空 45 | */ 46 | private transient ListState checkpointedState; 47 | 48 | private List nodes = new ArrayList<>(); 49 | 50 | private static ObjectMapper MAPPER = new ObjectMapper(); 51 | 52 | private StringBuilder sbLog = new StringBuilder(); 53 | 54 | public HbaseSink(JobConfig jobConfig) { 55 | this.jobConfig = jobConfig; 56 | } 57 | 58 | 59 | @Override 60 | public void open(Configuration parameters) throws Exception { 61 | super.open(parameters); 62 | this.hbaseUtil = new HbaseUtil(jobConfig.getHbaseConfig()); 63 | } 64 | 65 | /** 66 | * 在手动cancel和程序内部出错重试时都会触发close方法,在close方法中将nodes中的数据先flush,防止在两次写入之间,checkpoint点之前的数据丢失 67 | * 但会有数据重复,checkpoint点之后到发生故障时的数据会重复,如下示意图: 68 | *
 69 |      *                       flush
 70 |      *                        ^
 71 |      *                       /
 72 |      *            +--------------------------+
 73 |      * ------write------checkpoint-----------down----
 74 |      *                           +-----------+
 75 |      *                                 ^
 76 |      *                                /
 77 |      *                           will repeat
 78 |      * 
79 | *
80 | * 但对于写入具有幂等性的业务,数据重复写入不会影响结果 81 | * 82 | * @throws Exception 83 | */ 84 | @Override 85 | public void close() throws Exception { 86 | log.debug("execute sink close method"); 87 | super.close(); 88 | batchFlush(); 89 | if (this.hbaseUtil.getConnection() != null) { 90 | try { 91 | this.hbaseUtil.getConnection().close(); 92 | } catch (Exception e) { 93 | log.warn("connection close failed. error:{} ", e.getMessage()); 94 | } 95 | } 96 | } 97 | 98 | /** 99 | * 每条记录调用一次此方法 100 | * 101 | * @param node 102 | * @param context 103 | * @throws Exception 104 | */ 105 | @Override 106 | public void invoke(ObjectNode node, Context context) throws Exception { 107 | String partition = node.get("metadata").get("partition").asText(); 108 | String offset = node.get("metadata").get("offset").asText(); 109 | String value = node.get("value").asText(); 110 | log.debug("partition->{}|offset->{}|value->{}", partition, offset, value); 111 | nodes.add(node); 112 | if (nodes.size() >= jobConfig.getHbaseConfig().getBatchCount() || 113 | (System.currentTimeMillis() - currentTime > jobConfig.getHbaseConfig().getInterval() && nodes.size() > 0)) { 114 | batchFlush(); 115 | } 116 | } 117 | 118 | /** 119 | * 将{@link HbaseSink#nodes}中的数据批量写入 120 | * 121 | * @throws IOException 122 | */ 123 | private void batchFlush() throws IOException { 124 | long start = System.currentTimeMillis(); 125 | List puts = convertBatch(nodes); 126 | if (puts.size() == 0) { 127 | return; 128 | } 129 | long startPut = System.currentTimeMillis(); 130 | hbaseUtil.putBatchData(jobConfig.getTableName(), puts); 131 | long end = System.currentTimeMillis(); 132 | sbLog.append(String.format(" | batch_put(%d) cost %d ms", puts.size(), end - startPut)); 133 | sbLog.append(String.format(" | batch_total(%d) cost %d ms", puts.size(), end - start)); 134 | sbLog.append(String.format(" | per record cost %d ms", (end - start) / puts.size())); 135 | log.debug(sbLog.toString()); 136 | currentTime = System.currentTimeMillis(); 137 | sbLog = new StringBuilder(); 138 | nodes.clear(); 139 | checkpointedState.clear(); 140 | } 141 | 142 | /** 143 | * 批量处理 144 | * 145 | * @param objectNodes 一批数据 146 | * @return 返回批量的Put 147 | */ 148 | private List convertBatch(List objectNodes) throws IOException { 149 | Map> puts = new HashMap<>(objectNodes.size()); 150 | //存储每个需要join的表中这个批次的rowkey的值 151 | Map> joinKeys = new HashMap<>(objectNodes.size()); 152 | 153 | for (ObjectNode node : objectNodes) { 154 | Map keyValues = getKeyValues(node); 155 | 156 | //获取拼接的rowKey 157 | List rowKeyValues = new ArrayList<>(); 158 | jobConfig.getRowKeyColumns().forEach(e -> rowKeyValues.add(keyValues.get(e))); 159 | if (rowKeyValues.stream().anyMatch(Objects::isNull)) { 160 | //如果组合rowKey的字段中有null,则过滤掉此记录 161 | log.warn("columns which consist of rowKey has null value"); 162 | continue; 163 | } 164 | String rowKey = String.join(jobConfig.getRowKeyDelimiter(), rowKeyValues); 165 | Put put = new Put(Bytes.toBytes(rowKey)); 166 | 167 | //获取这个joinTable表中这个批次所有需要join的key 168 | for (JoinTable joinTable : jobConfig.getJoinTables()) { 169 | 170 | joinKeys.compute(joinTable.getTableName(), (k, v) -> { 171 | //keyValues.get(joinTable.getJoinKey()的值有可能为null,需做空判断 172 | if (keyValues.get(joinTable.getJoinKey()) != null) { 173 | if (v == null) { 174 | v = new HashSet<>(); 175 | v.add(keyValues.get(joinTable.getJoinKey())); 176 | } else { 177 | v.add(keyValues.get(joinTable.getJoinKey())); 178 | } 179 | } 180 | return v; 181 | }); 182 | } 183 | 184 | 185 | //原始topic需要写入的列 186 | keyValues.forEach((k, v) -> { 187 
| String family = k.split(":")[0]; 188 | String column = k.split(":")[1]; 189 | put.addColumn(Bytes.toBytes(family), Bytes.toBytes(column), v != null ? Bytes.toBytes(v) : null); 190 | }); 191 | 192 | puts.put(put, keyValues); 193 | } 194 | 195 | //当需要join时执行下面操作 196 | for (JoinTable joinTable : jobConfig.getJoinTables()) { 197 | //取出这个joinTable表中这个批次所有需要join的key 198 | Set keys = joinKeys.get(joinTable.getTableName()); 199 | List gets = new ArrayList<>(); 200 | //将key和result一一对应 201 | Map keyResults = new HashMap<>(keys.size()); 202 | 203 | //生成需要批量get的List 204 | keys.forEach(e -> { 205 | Get get = new Get(Bytes.toBytes(e)); 206 | joinTable.getColumnsMapping().forEach((k, v) -> { 207 | get.addColumn(Bytes.toBytes(k.split(":")[0]), Bytes.toBytes(k.split(":")[1])); 208 | }); 209 | gets.add(get); 210 | }); 211 | 212 | 213 | long start = System.currentTimeMillis(); 214 | //执行批量get 215 | Result[] results = hbaseUtil.batchGet(joinTable.getTableName(), gets); 216 | for (Result result : results) { 217 | if (result != null) { 218 | keyResults.put(Bytes.toString(result.getRow()), result); 219 | } 220 | } 221 | long end = System.currentTimeMillis(); 222 | 223 | sbLog.append(String.format("| batch_get %s(%d) %d ms", joinTable.getTableName(), keys.size(), (end - start))); 224 | //对之前原始写入的每个put,获取这个表需要join的rowKey的result,然后将result中的值根据joinTable的配置添加到put的对应列中 225 | puts.forEach((put, keyValues) -> { 226 | Result result = keyResults.get(keyValues.get(joinTable.getJoinKey())); 227 | if (result != null) { 228 | joinTable.getColumnsMapping().forEach((k, v) -> { 229 | byte[] columnValue = result.getValue(Bytes.toBytes(k.split(":")[0]), Bytes.toBytes(k.split(":")[1])); 230 | put.addColumn(Bytes.toBytes(v.split(":")[0]), Bytes.toBytes(v.split(":")[1]), columnValue); 231 | }); 232 | } 233 | }); 234 | } 235 | 236 | 237 | return new ArrayList<>(puts.keySet()); 238 | } 239 | 240 | private Put convert(ObjectNode node) throws IOException { 241 | 242 | Map keyValues = getKeyValues(node); 243 | 244 | //获取拼接的rowKey 245 | List rowKeyValues = new ArrayList<>(); 246 | 247 | jobConfig.getRowKeyColumns().forEach(e -> rowKeyValues.add(keyValues.get(e))); 248 | String rowKey = String.join(jobConfig.getRowKeyDelimiter(), rowKeyValues); 249 | 250 | Put put = new Put(Bytes.toBytes(rowKey)); 251 | 252 | //原始topic需要写入的列 253 | keyValues.forEach((k, v) -> { 254 | String family = k.split(":")[0]; 255 | String column = k.split(":")[1]; 256 | put.addColumn(Bytes.toBytes(family), Bytes.toBytes(column), Bytes.toBytes(v)); 257 | }); 258 | 259 | //需要join的table 260 | for (JoinTable joinTable : jobConfig.getJoinTables()) { 261 | byte[] joinKey = Bytes.toBytes(keyValues.get(joinTable.getJoinKey())); 262 | Get get = new Get(joinKey); 263 | joinTable.getColumnsMapping().forEach((k, v) -> { 264 | get.addColumn(Bytes.toBytes(k.split(":")[0]), Bytes.toBytes(k.split(":")[1])); 265 | }); 266 | //获取此rowKey所有的列值 267 | Result result = hbaseUtil.singleGet(joinTable.getTableName(), get); 268 | //如果result为空,则不需处理 269 | if (result != null) { 270 | joinTable.getColumnsMapping().forEach((k, v) -> { 271 | byte[] columnValue = result.getValue(Bytes.toBytes(k.split(":")[0]), Bytes.toBytes(k.split(":")[1])); 272 | put.addColumn(Bytes.toBytes(v.split(":")[0]), Bytes.toBytes(v.split(":")[1]), columnValue); 273 | }); 274 | } 275 | } 276 | return put; 277 | } 278 | 279 | /** 280 | * 根据配置中给定的列值对应关系,将每条消息解析成格式,key为配置中指定的列名(包含列簇) 281 | * 目前支持两种消息格式:CSV和JSON格式的字符串型数据, 282 | * 在处理消息时,kafka消息的key默认会被集成到value中, 
对于CSV格式,kafka消息的key处在index=0的位置;对于JSON格式,kafka消息的key对应默认的kafka_key字段 283 | * 284 | * @param node flink接入的kafka消息 285 | * @return 返回字段名称对应的值 286 | */ 287 | private Map getKeyValues(ObjectNode node) { 288 | Map indexColumns = jobConfig.getIndexColumnMapping(); 289 | String key = node.get("key") == null ? "" : node.get("key").asText(); 290 | String value = node.get("value") == null ? "" : node.get("value").asText(); 291 | 292 | 293 | Map keyValues = new HashMap<>(8); 294 | 295 | ValueFormat valueFormat = jobConfig.getKafkaConfig().getValueFormat(); 296 | switch (valueFormat) { 297 | case CSV: 298 | //将key和value拼接起来,配置时kafka的key值作为下标的第0个 299 | String input = key + jobConfig.getKafkaConfig().getDelimiter() + value; 300 | String[] columnValues = StringUtils.splitPreserveAllTokens(input, jobConfig.getKafkaConfig().getDelimiter()); 301 | 302 | //将index对应的列值写入对应的列名下,列名包含了列簇名,形如:family:qualifier 303 | for (Map.Entry entry : indexColumns.entrySet()) { 304 | try { 305 | keyValues.put(entry.getValue(), columnValues[Integer.valueOf(entry.getKey())]); 306 | } catch (Exception e) { 307 | 308 | log.warn("index {} out of boundary.", entry.getKey(), e); 309 | } 310 | } 311 | 312 | break; 313 | case JSON: 314 | default: 315 | //将kafka的key加入node,统一处理 316 | try { 317 | 318 | ObjectNode jsonNode = (ObjectNode) MAPPER.readTree(value); 319 | 320 | jsonNode.put("kafka_key", key); 321 | 322 | //将配置中指定的列值写入对应的列名下,列名包含了列簇名,形如:family:qualifier 323 | indexColumns.forEach((k, v) -> { 324 | if (jsonNode.get(k) != null) { 325 | keyValues.put(v, jsonNode.get(k).asText()); 326 | } 327 | }); 328 | } catch (IOException e) { 329 | keyValues.clear(); 330 | indexColumns.forEach((k, v) -> keyValues.put(v, null)); 331 | String partition = node.get("metadata").get("partition").asText(); 332 | String offset = node.get("metadata").get("offset").asText(); 333 | String topic = node.get("metadata").get("topic").asText(); 334 | log.warn("this json record failed.topic->{},partition->{},offset->{},value->{}", topic, partition, offset, value); 335 | } 336 | break; 337 | } 338 | return keyValues; 339 | } 340 | 341 | /** 342 | * 执行频率和{@link FlinkRunner}中指定的checkpoint间隔一致 343 | * 344 | * @param context 345 | * @throws Exception 346 | */ 347 | @Override 348 | public void snapshotState(FunctionSnapshotContext context) throws Exception { 349 | for (ObjectNode element : nodes) { 350 | checkpointedState.add(element); 351 | } 352 | log.debug("execute snapshot at {}", System.currentTimeMillis()); 353 | } 354 | 355 | /** 356 | * 在程序内部出错重启时,如果调用了snapshotState方法,则会恢复checkpointedState中的数据,如果是手动cancel或重试几次失败后重新提交任务,此时 357 | * 的checkpointedState会是新的对象,里面没有数据 358 | * 359 | * @param context 360 | * @throws Exception 361 | */ 362 | @Override 363 | public void initializeState(FunctionInitializationContext context) throws Exception { 364 | ListStateDescriptor descriptor = 365 | new ListStateDescriptor<>( 366 | "hbase-sink-cp", 367 | TypeInformation.of(new TypeHint() { 368 | })); 369 | 370 | checkpointedState = context.getOperatorStateStore().getListState(descriptor); 371 | 372 | if (context.isRestored()) { 373 | for (ObjectNode element : checkpointedState.get()) { 374 | nodes.add(element); 375 | } 376 | } 377 | log.info("initialState {} record", nodes.size()); 378 | } 379 | } 380 | -------------------------------------------------------------------------------- /src/main/java/com/singgel/bigdata/flinksinkhbase/common/CustomJsonDeserializationSchema.java: -------------------------------------------------------------------------------- 1 | package 
com.singgel.bigdata.flinksinkhbase.common; 2 | 3 | import org.apache.flink.api.common.typeinfo.TypeInformation; 4 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.ObjectMapper; 5 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.ObjectNode; 6 | import org.apache.flink.streaming.util.serialization.KeyedDeserializationSchema; 7 | 8 | import java.io.IOException; 9 | 10 | import static org.apache.flink.api.java.typeutils.TypeExtractor.getForClass; 11 | 12 | /** 13 | * \* @author singgel 14 | * \* @created_at: 2019/3/24 下午2:22 15 | * \ 16 | */ 17 | public class CustomJsonDeserializationSchema implements KeyedDeserializationSchema { 18 | 19 | private final boolean includeMetadata; 20 | private ObjectMapper mapper; 21 | 22 | public CustomJsonDeserializationSchema(boolean includeMetadata) { 23 | this.includeMetadata = includeMetadata; 24 | } 25 | 26 | @Override 27 | public ObjectNode deserialize(byte[] messageKey, byte[] message, String topic, int partition, long offset) throws IOException { 28 | if (mapper == null) { 29 | mapper = new ObjectMapper(); 30 | } 31 | ObjectNode node = mapper.createObjectNode(); 32 | if (messageKey != null) { 33 | node.put("key", new String(messageKey, "utf-8")); 34 | } 35 | if (message != null) { 36 | node.put("value", new String(message, "utf-8")); 37 | } 38 | if (includeMetadata) { 39 | node.putObject("metadata") 40 | .put("offset", offset) 41 | .put("topic", topic) 42 | .put("partition", partition); 43 | } 44 | return node; 45 | } 46 | 47 | @Override 48 | public boolean isEndOfStream(ObjectNode nextElement) { 49 | return false; 50 | } 51 | 52 | @Override 53 | public TypeInformation getProducedType() { 54 | return getForClass(ObjectNode.class); 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /src/main/java/com/singgel/bigdata/flinksinkhbase/common/HbaseUtil.java: -------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.flinksinkhbase.common; 2 | 3 | import com.singgel.bigdata.flinksinkhbase.config.HbaseConfig; 4 | import lombok.extern.slf4j.Slf4j; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.hbase.HBaseConfiguration; 7 | import org.apache.hadoop.hbase.HColumnDescriptor; 8 | import org.apache.hadoop.hbase.HTableDescriptor; 9 | import org.apache.hadoop.hbase.TableName; 10 | import org.apache.hadoop.hbase.client.*; 11 | import org.slf4j.Logger; 12 | import org.slf4j.LoggerFactory; 13 | 14 | import java.io.IOException; 15 | import java.io.Serializable; 16 | import java.util.ArrayList; 17 | import java.util.List; 18 | 19 | /** 20 | * \* @author singgel 21 | * \* @created_at: 2019/3/24 下午1:48 22 | * \ 23 | */ 24 | @Slf4j 25 | public class HbaseUtil implements Serializable { 26 | 27 | 28 | private static Logger logger = LoggerFactory.getLogger(HbaseUtil.class); 29 | 30 | private Configuration configuration; 31 | private Connection connection; 32 | 33 | public HbaseUtil(HbaseConfig hbaseConfig) { 34 | this.configuration = HBaseConfiguration.create(); 35 | this.configuration.set("hbase.zookeeper.quorum", hbaseConfig.getZookerperQuorum()); 36 | this.configuration.set("hbase.zookeeper.property.clientPort", hbaseConfig.getPort()); 37 | this.configuration.set("zookeeper.znode.parent", hbaseConfig.getZookeeperZondeParent()); 38 | hbaseConfig.getOptionalProp().forEach((k, v) -> this.configuration.set(k, v)); 39 | try { 40 | connection = ConnectionFactory.createConnection(configuration); 
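            // A single HBase Connection is created here from the zookeeper quorum/port/znode settings above.
            // HbaseSink.open() builds one HbaseUtil (and thus one Connection) per parallel sink instance and
            // reuses it for all singleGet/batchGet/putBatchData calls, each of which obtains a lightweight
            // Table from it per request; the Connection is closed again in HbaseSink.close().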
41 | } catch (IOException e) { 42 | e.printStackTrace(); 43 | } 44 | } 45 | 46 | public Connection getConnection() { 47 | return connection; 48 | } 49 | 50 | /** 51 | * 获取某个rowKey的所有列簇所有列值 52 | * 53 | * @param tableName hbase表名 54 | * @param get 只指定了rowKey的get 55 | * @return 返回result 56 | */ 57 | public Result singleGet(String tableName, Get get) throws IOException { 58 | 59 | Result result = null; 60 | try (Table table = connection.getTable(TableName.valueOf(tableName))) { 61 | result = table.get(get); 62 | 63 | } catch (IOException e) { 64 | log.error("singleGet rowKey:{} get failed", new String(get.getRow()), e); 65 | throw e; 66 | } 67 | return result; 68 | } 69 | 70 | /** 71 | * 批量获取 72 | * 73 | * @param tableName 表名 74 | * @param gets get列表 75 | * @return 76 | */ 77 | public Result[] batchGet(String tableName, List gets) throws IOException { 78 | Result[] results = null; 79 | try (Table table = connection.getTable(TableName.valueOf(tableName))) { 80 | results = table.get(gets); 81 | 82 | } catch (IOException e) { 83 | logger.warn("batchGets get failed", e); 84 | throw e; 85 | } 86 | return results; 87 | } 88 | 89 | 90 | /** 91 | * 向hbase表插入数据 92 | * 93 | * @param tableName hbase表名 94 | * @param put 要插入的put,需指定列簇和列 95 | */ 96 | public void putData(String tableName, Put put) { 97 | System.out.println("begin put"); 98 | try (Table table = connection.getTable(TableName.valueOf(tableName))) { 99 | table.put(put); 100 | System.out.println("put success"); 101 | } catch (IOException e) { 102 | logger.warn("rowKey:{} put failed", new String(put.getRow()), e); 103 | } 104 | } 105 | 106 | /** 107 | * 向hbase表批量插入数据 108 | * 109 | * @param tableName hbase表名 110 | * @param puts 要插入的puts,需指定列簇和列 111 | */ 112 | public void putBatchData(String tableName, List puts) throws IOException { 113 | try (Table table = connection.getTable(TableName.valueOf(tableName))) { 114 | table.put(puts); 115 | } catch (IOException e) { 116 | log.error("put batch data failed",e); 117 | throw e; 118 | } 119 | } 120 | 121 | /** 122 | * 准备要写入的hbase表,如果表不存在则创建,并添加列簇,如果存在则添加不存在的列簇 123 | * 124 | * @param tableName hbase表名 125 | * @param families 写入的列簇 126 | * @throws IOException 127 | */ 128 | public void prepareTable(String tableName, Iterable families) throws IOException { 129 | try { 130 | HBaseAdmin admin = (HBaseAdmin) connection.getAdmin(); 131 | if (admin.tableExists(tableName)) { 132 | Table table = connection.getTable(TableName.valueOf(tableName)); 133 | HTableDescriptor hTableDescriptor = table.getTableDescriptor(); 134 | List existFamilies = new ArrayList<>(); 135 | List needAddedFamilies = new ArrayList<>(); 136 | for (HColumnDescriptor fdescriptor : hTableDescriptor.getColumnFamilies()) { 137 | existFamilies.add(fdescriptor.getNameAsString()); 138 | } 139 | for (String family : families) { 140 | if (!existFamilies.contains(family)) { 141 | needAddedFamilies.add(family); 142 | } 143 | } 144 | //当有需要新增的列簇时再disable table,增加列簇 145 | if (needAddedFamilies.size() > 0) { 146 | admin.disableTable(tableName); 147 | for (String family : needAddedFamilies) { 148 | admin.addColumn(tableName, new HColumnDescriptor(family)); 149 | } 150 | admin.enableTable(tableName); 151 | } 152 | } else { 153 | HTableDescriptor hTableDescriptor = new HTableDescriptor(TableName.valueOf(tableName)); 154 | for (String family : families) { 155 | hTableDescriptor.addFamily(new HColumnDescriptor(family)); 156 | } 157 | admin.createTable(hTableDescriptor); 158 | } 159 | admin.close(); 160 | connection.close(); 161 | } catch (IOException e) { 162 | 
log.error("prepare table failed! check the table and columnFamilies.",e); 163 | throw e; 164 | } 165 | 166 | 167 | } 168 | 169 | } 170 | -------------------------------------------------------------------------------- /src/main/java/com/singgel/bigdata/flinksinkhbase/common/JobConfigManager.java: -------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.flinksinkhbase.common; 2 | 3 | import com.fasterxml.jackson.databind.ObjectMapper; 4 | import com.typesafe.config.ConfigFactory; 5 | import com.singgel.bigdata.flinksinkhbase.config.JobConfig; 6 | 7 | import java.io.BufferedReader; 8 | import java.io.IOException; 9 | import java.io.InputStreamReader; 10 | import java.net.HttpURLConnection; 11 | import java.net.URL; 12 | 13 | /** 14 | * \* @author singgel 15 | * \* @created_at: 2019/3/24 下午5:38 16 | * \ 17 | */ 18 | public class JobConfigManager { 19 | 20 | private static final ObjectMapper MAPPER = new ObjectMapper(); 21 | private static final String URL = ConfigFactory.load().getConfig("apollo").getString("url"); 22 | 23 | /** 24 | * 从Apollo配置中心获取jobConfig的配置 25 | * 26 | * @param key 配置的key 27 | * @return JobConfig 28 | * @throws IOException 29 | */ 30 | public static JobConfig getConfigByKey(String key) throws Exception { 31 | 32 | java.net.URL url = new URL(String.format("%s/apollo/getConf?key=%s",URL, key)); 33 | HttpURLConnection con = (HttpURLConnection) url.openConnection(); 34 | 35 | con.setRequestMethod("GET"); 36 | int responseCode = con.getResponseCode(); 37 | System.out.println("\nSending 'GET' request to URLSTR : " + url); 38 | System.out.println("Response Code : " + responseCode); 39 | 40 | BufferedReader in = new BufferedReader( 41 | new InputStreamReader(con.getInputStream())); 42 | String inputLine; 43 | StringBuffer response = new StringBuffer(); 44 | 45 | while ((inputLine = in.readLine()) != null) { 46 | response.append(inputLine); 47 | } 48 | in.close(); 49 | 50 | //打印结果 51 | System.out.println(response.toString()); 52 | String jsonRet = MAPPER.readTree(response.toString()).get("data").asText(); 53 | JobConfig jobConfig = MAPPER.readValue(jsonRet, JobConfig.class); 54 | 55 | return jobConfig; 56 | 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/com/singgel/bigdata/flinksinkhbase/common/JoinTable.java: -------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.flinksinkhbase.common; 2 | 3 | import java.io.Serializable; 4 | import java.util.Map; 5 | 6 | /** 7 | * \* @author singgel 8 | * \* @created_at: 2019/3/24 下午5:03 9 | * \ 10 | */ 11 | 12 | /** 13 | * 需要join的Hbase表,按照主表中的某列值作为此表中的rowKey,获取此表此rowKey的相关列值,插入到主表对应的列,完成join 14 | */ 15 | public class JoinTable implements Serializable{ 16 | 17 | /** 18 | * hbase 表名 19 | */ 20 | private String tableName; 21 | 22 | /** 23 | * 和此表RowKey相关联的主表列名 24 | */ 25 | private String joinKey; 26 | 27 | /** 28 | * 此表中列簇和列的对应关系 29 | */ 30 | 31 | /** 32 | * 此表中列和列簇及写入到主表中的列簇的对应关系,如: 33 | * key-> fromFamily:fromColumn 34 | * value -> toFamily:toColumn 35 | */ 36 | private Map columnsMapping; 37 | 38 | 39 | public String getTableName() { 40 | return tableName; 41 | } 42 | 43 | public void setTableName(String tableName) { 44 | this.tableName = tableName; 45 | } 46 | 47 | public String getJoinKey() { 48 | return joinKey; 49 | } 50 | 51 | public void setJoinKey(String joinKey) { 52 | this.joinKey = joinKey; 53 | } 54 | 55 | public Map getColumnsMapping() { 56 | return 
columnsMapping; 57 | } 58 | 59 | public void setColumnsMapping(Map columnsMapping) { 60 | this.columnsMapping = columnsMapping; 61 | } 62 | 63 | public JoinTable() { 64 | } 65 | 66 | public JoinTable(String tableName, String joinKey, Map columnsMapping) { 67 | this.tableName = tableName; 68 | this.joinKey = joinKey; 69 | this.columnsMapping = columnsMapping; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/java/com/singgel/bigdata/flinksinkhbase/common/ValueFormat.java: -------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.flinksinkhbase.common; 2 | 3 | /** 4 | * kafka消息值的类型格式 5 | */ 6 | public enum ValueFormat { 7 | /** 8 | * CSV格式,以固定分隔符分割 9 | */ 10 | CSV, 11 | 12 | /** 13 | * ObjecNode的json格式 14 | */ 15 | JSON 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/com/singgel/bigdata/flinksinkhbase/config/HbaseConfig.java: -------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.flinksinkhbase.config; 2 | 3 | import java.io.Serializable; 4 | import java.util.Map; 5 | 6 | /** 7 | * \* @author singgel 8 | * \* @created_at: 2019/3/28 上午9:29 9 | * \ 10 | */ 11 | public class HbaseConfig implements Serializable{ 12 | 13 | /** 14 | * zookeeper机器名,多个用逗号连接 15 | */ 16 | private String zookerperQuorum; 17 | 18 | /** 19 | * 端口,默认2181 20 | */ 21 | private String port = "2181"; 22 | 23 | /** 24 | * hbase在zookeeper节点的路径 25 | */ 26 | private String zookeeperZondeParent; 27 | 28 | /** 29 | * 其它非必需配置 30 | */ 31 | private Map optionalProp; 32 | 33 | /** 34 | * 批量写入时每批的数量 35 | */ 36 | private int batchCount; 37 | 38 | /** 39 | * 每批次的时间间隔,单位:毫秒 40 | */ 41 | private long interval; 42 | 43 | public HbaseConfig() { 44 | } 45 | 46 | public HbaseConfig(String zookerperQuorum, 47 | String port, 48 | String zookeeperZondeParent, 49 | int batchCount, 50 | long interval, 51 | Map optionalProp) { 52 | this.zookerperQuorum = zookerperQuorum; 53 | this.port = port; 54 | this.zookeeperZondeParent = zookeeperZondeParent; 55 | this.batchCount = batchCount; 56 | this.interval = interval; 57 | this.optionalProp =optionalProp; 58 | } 59 | 60 | public String getZookerperQuorum() { 61 | return zookerperQuorum; 62 | } 63 | 64 | public void setZookerperQuorum(String zookerperQuorum) { 65 | this.zookerperQuorum = zookerperQuorum; 66 | } 67 | 68 | public String getPort() { 69 | return port; 70 | } 71 | 72 | public void setPort(String port) { 73 | this.port = port; 74 | } 75 | 76 | public String getZookeeperZondeParent() { 77 | return zookeeperZondeParent; 78 | } 79 | 80 | public void setZookeeperZondeParent(String zookeeperZondeParent) { 81 | this.zookeeperZondeParent = zookeeperZondeParent; 82 | } 83 | 84 | public int getBatchCount() { 85 | return batchCount; 86 | } 87 | 88 | public void setBatchCount(int batchCount) { 89 | this.batchCount = batchCount; 90 | } 91 | 92 | public long getInterval() { 93 | return interval; 94 | } 95 | 96 | public void setInterval(long interval) { 97 | this.interval = interval; 98 | } 99 | 100 | public Map getOptionalProp() { 101 | return optionalProp; 102 | } 103 | 104 | public void setOptionalProp(Map optionalProp) { 105 | this.optionalProp = optionalProp; 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/main/java/com/singgel/bigdata/flinksinkhbase/config/JobConfig.java: 
-------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.flinksinkhbase.config; 2 | 3 | 4 | import com.singgel.bigdata.flinksinkhbase.common.JoinTable; 5 | import com.singgel.bigdata.flinksinkhbase.common.ValueFormat; 6 | import org.apache.commons.lang3.StringUtils; 7 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.core.JsonProcessingException; 8 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.ObjectMapper; 9 | 10 | import java.io.Serializable; 11 | import java.util.*; 12 | import java.util.stream.Collectors; 13 | 14 | /** 15 | * \* @author singgel 16 | * \* @created_at: 2019/3/24 下午1:50 17 | * \ 18 | */ 19 | public class JobConfig implements Serializable { 20 | 21 | private static final ObjectMapper MAPPER = new ObjectMapper(); 22 | 23 | public static final long CHECKPOINT_INTERVAR = 10000L; 24 | 25 | /** 26 | * kafka消息(整合key和value的值)按分隔符分隔后index和列信息的对应关系 27 | */ 28 | private Map indexColumnMapping = new HashMap<>(); 29 | 30 | 31 | /** 32 | * 多列组合成rowKey时指定的连接字符 33 | */ 34 | private String rowKeyDelimiter; 35 | 36 | /** 37 | * 组成rowKey的列 38 | */ 39 | private List rowKeyColumns; 40 | 41 | /** 42 | * 要写入hbase表的表名,称为"主表" 43 | */ 44 | private String tableName; 45 | 46 | /** 47 | * kafka源配置 48 | */ 49 | private KafkaConfig kafkaConfig; 50 | 51 | 52 | /** 53 | * hbase基本配置 54 | */ 55 | private HbaseConfig hbaseConfig; 56 | 57 | /** 58 | * flink job名字,同apollo配置的key 59 | */ 60 | private String jobName; 61 | 62 | /** 63 | * 通过实时平台前端界面{@link \http://10.10.20.81:7878/realtime/platform/jobList} 启动flink任务时指定的并发数 64 | */ 65 | private int parallelism; 66 | 67 | /** 68 | * 通过实时平台前端界面{@link \http://10.10.20.81:7878/realtime/platform/jobList} 启动flink任务时指定的jar包名称 69 | */ 70 | private String jarName; 71 | 72 | public JobConfig() { 73 | } 74 | 75 | public JobConfig(Map indexColumnMapping, 76 | String rowKeyDelimiter, 77 | List rowKeyColumns, 78 | String tableName, 79 | KafkaConfig kafkaConfig, 80 | HbaseConfig hbaseConfig, 81 | List joinTables, 82 | String jobName, 83 | int parallelism, 84 | String jarName) { 85 | this.indexColumnMapping = indexColumnMapping; 86 | this.rowKeyDelimiter = rowKeyDelimiter; 87 | this.rowKeyColumns = rowKeyColumns; 88 | this.tableName = tableName; 89 | this.kafkaConfig = kafkaConfig; 90 | this.hbaseConfig = hbaseConfig; 91 | this.joinTables = joinTables; 92 | this.jobName = jobName; 93 | this.parallelism = parallelism; 94 | this.jarName = jarName; 95 | } 96 | 97 | 98 | /** 99 | * 校验: 100 | * 1. joinTable中的joinKey需要在主表的列中存在 101 | * 2. indexColumnMapping的values不能重复 102 | * 3. joinTable中需要写入到mainTable中的列不能与mainTable原有的列重合 103 | * 4. 当valueFormat=CSV时,delimiter必须要指定 104 | * 5. 
the HBase batch-write interval must not be larger than the Flink checkpoint interval 105 | */ 106 | public void validate() throws IllegalArgumentException { 107 | for (JoinTable joinTable : this.joinTables) { 108 | if (indexColumnMapping.values().stream().noneMatch(e -> e.contains(joinTable.getJoinKey()))) { 109 | throw new IllegalArgumentException(String.format("%s does not exist in the columns of main table %s", joinTable.getJoinKey(), tableName)); 110 | } 111 | } 112 | if (new HashSet<>(indexColumnMapping.values()).size() != indexColumnMapping.values().size()) { 113 | throw new IllegalArgumentException("the values of indexColumnMapping must not contain duplicate columns"); 114 | } 115 | for (JoinTable joinTable : joinTables) { 116 | if (joinTable.getColumnsMapping().values().stream().anyMatch(e -> indexColumnMapping.values().contains(e))) { 117 | throw new IllegalArgumentException(String.format("some column in joinTable:%s has existed in mainTable:%s. Please check!", joinTable, tableName)); 118 | } 119 | } 120 | if (kafkaConfig.getValueFormat() == ValueFormat.CSV && StringUtils.isEmpty(kafkaConfig.getDelimiter())) { 121 | throw new IllegalArgumentException("the delimiter must be given when the valueFormat is CSV"); 122 | } 123 | if (hbaseConfig.getInterval() > CHECKPOINT_INTERVAR) { 124 | hbaseConfig.setInterval(CHECKPOINT_INTERVAR); 125 | } 126 | 127 | } 128 | 129 | /** 130 | * Returns the column families that the main table needs for the write 131 | * 132 | * @return set of column families 133 | */ 134 | public Set<String> families() { 135 | Set<String> mainTableFamilies = this.indexColumnMapping.values().stream().map(e -> e.split(":")[0]).collect(Collectors.toSet()); 136 | for (JoinTable joinTable : joinTables) { 137 | mainTableFamilies.addAll(joinTable.getColumnsMapping().values().stream().map(e -> e.split(":")[0]).collect(Collectors.toSet())); 138 | } 139 | return mainTableFamilies; 140 | } 141 | 142 | 143 | public Map<String, String> getIndexColumnMapping() { 144 | return indexColumnMapping; 145 | } 146 | 147 | public void setIndexColumnMapping(Map<String, String> indexColumnMapping) { 148 | this.indexColumnMapping = indexColumnMapping; 149 | } 150 | 151 | 152 | private List<JoinTable> joinTables = new ArrayList<>(); 153 | 154 | public String getRowKeyDelimiter() { 155 | return rowKeyDelimiter; 156 | } 157 | 158 | public void setRowKeyDelimiter(String rowKeyDelimiter) { 159 | this.rowKeyDelimiter = rowKeyDelimiter; 160 | } 161 | 162 | public List<String> getRowKeyColumns() { 163 | return rowKeyColumns; 164 | } 165 | 166 | public List<JoinTable> getJoinTables() { 167 | return joinTables; 168 | } 169 | 170 | public String getTableName() { 171 | 172 | return tableName; 173 | } 174 | 175 | public KafkaConfig getKafkaConfig() { 176 | return kafkaConfig; 177 | } 178 | 179 | public void setKafkaConfig(KafkaConfig kafkaConfig) { 180 | this.kafkaConfig = kafkaConfig; 181 | } 182 | 183 | public HbaseConfig getHbaseConfig() { 184 | return hbaseConfig; 185 | } 186 | 187 | public void setHbaseConfig(HbaseConfig hbaseConfig) { 188 | this.hbaseConfig = hbaseConfig; 189 | } 190 | 191 | public void setTableName(String tableName) { 192 | this.tableName = tableName; 193 | } 194 | 195 | public void setRowKeyColumns(List<String> rowKeyColumns) { 196 | this.rowKeyColumns = rowKeyColumns; 197 | } 198 | 199 | public void setJoinTables(List<JoinTable> joinTables) { 200 | this.joinTables = joinTables; 201 | } 202 | 203 | public String getJobName() { 204 | return jobName; 205 | } 206 | 207 | public void setJobName(String jobName) { 208 | this.jobName = jobName; 209 | } 210 | 211 | public int getParallelism() { 212 | return parallelism; 213 | } 214 | 215 | public void setParallelism(int parallelism) { 216 | this.parallelism = 
parallelism; 217 | } 218 | 219 | public String getJarName() { 220 | return jarName; 221 | } 222 | 223 | public void setJarName(String jarName) { 224 | this.jarName = jarName; 225 | } 226 | 227 | public static long getCheckpointIntervar() { 228 | return CHECKPOINT_INTERVAR; 229 | } 230 | 231 | @Override 232 | public String toString() { 233 | String json = null; 234 | try { 235 | json = MAPPER.writeValueAsString(this); 236 | } catch (JsonProcessingException e) { 237 | e.printStackTrace(); 238 | } 239 | return json; 240 | } 241 | 242 | } 243 | -------------------------------------------------------------------------------- /src/main/java/com/singgel/bigdata/flinksinkhbase/config/KafkaConfig.java: -------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.flinksinkhbase.config; 2 | 3 | import com.singgel.bigdata.flinksinkhbase.common.ValueFormat; 4 | 5 | import java.io.Serializable; 6 | import java.util.Map; 7 | import java.util.Properties; 8 | 9 | /** 10 | * @author singgel 11 | * @created_at: 2019/3/24 17:21 12 | * 13 | */ 14 | public class KafkaConfig implements Serializable { 15 | 16 | /** 17 | * Bootstrap servers of the source Kafka cluster 18 | */ 19 | private String bootstrapServers; 20 | 21 | /** 22 | * Source topic 23 | */ 24 | private String topic; 25 | 26 | /** 27 | * Consumer groupId used for the topic 28 | */ 29 | private String groupId; 30 | 31 | /** 32 | * Format of the Kafka value; currently CSV and JSON are supported 33 | */ 34 | private ValueFormat valueFormat; 35 | 36 | /** 37 | * Delimiter of the Kafka message value, required when valueFormat is CSV 38 | */ 39 | private String delimiter; 40 | 41 | /** 42 | * Other settings 43 | */ 44 | private Map<String, String> optionalProps; 45 | 46 | 47 | public KafkaConfig() { 48 | } 49 | 50 | 51 | public KafkaConfig(String bootstrapServers, String topic, String groupId, ValueFormat valueFormat, String delimiter, Map<String, String> optionalProps) { 52 | this.bootstrapServers = bootstrapServers; 53 | this.topic = topic; 54 | this.groupId = groupId; 55 | this.valueFormat = valueFormat; 56 | this.delimiter = delimiter; 57 | this.optionalProps = optionalProps; 58 | } 59 | 60 | public String getBootstrapServers() { 61 | return bootstrapServers; 62 | } 63 | 64 | public void setBootstrapServers(String bootstrapServers) { 65 | this.bootstrapServers = bootstrapServers; 66 | } 67 | 68 | public String getTopic() { 69 | return topic; 70 | } 71 | 72 | public void setTopic(String topic) { 73 | this.topic = topic; 74 | } 75 | 76 | public String getGroupId() { 77 | return groupId; 78 | } 79 | 80 | public void setGroupId(String groupId) { 81 | this.groupId = groupId; 82 | } 83 | 84 | public ValueFormat getValueFormat() { 85 | return valueFormat; 86 | } 87 | 88 | public void setValueFormat(ValueFormat valueFormat) { 89 | this.valueFormat = valueFormat; 90 | } 91 | 92 | public String getDelimiter() { 93 | return delimiter; 94 | } 95 | 96 | public void setDelimiter(String delimiter) { 97 | this.delimiter = delimiter; 98 | } 99 | 100 | public Map<String, String> getOptionalProps() { 101 | return optionalProps; 102 | } 103 | 104 | public void setOptionalProps(Map<String, String> optionalProps) { 105 | this.optionalProps = optionalProps; 106 | } 107 | 108 | 109 | public Properties kafkaProps() { 110 | Properties props = new Properties(); 111 | props.setProperty("bootstrap.servers", this.bootstrapServers); 112 | props.setProperty("group.id", this.groupId); 113 | optionalProps.forEach(props::setProperty); 114 | return props; 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /src/main/resources/application.conf: 
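KafkaConfig.kafkaProps() is exactly the Properties object a Flink Kafka source consumer needs. Before the resource files below, here is a hedged sketch of that wiring; the project's real entry point is FlinkRunner (not part of this excerpt), and FlinkKafkaConsumer / SimpleStringSchema are the standard Flink 1.x connector classes, which may appear as version-suffixed variants such as FlinkKafkaConsumer010 depending on the Flink version in pom.xml.

```java
import com.singgel.bigdata.flinksinkhbase.common.ValueFormat;
import com.singgel.bigdata.flinksinkhbase.config.JobConfig;
import com.singgel.bigdata.flinksinkhbase.config.KafkaConfig;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;

import java.util.HashMap;

/**
 * Sketch only: how a KafkaConfig would typically be turned into a Flink source.
 * The project's real wiring lives in FlinkRunner / HbaseSink, which are not shown here.
 */
public class KafkaSourceSketch {

    public static void main(String[] args) throws Exception {
        KafkaConfig kafkaConfig = new KafkaConfig(
                "localhost:9092", "recommend2.statistics", "flink_demo_group",
                ValueFormat.CSV, "|", new HashMap<>());

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // at-least-once delivery relies on checkpointing being enabled
        env.enableCheckpointing(JobConfig.CHECKPOINT_INTERVAR);

        FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<>(
                kafkaConfig.getTopic(), new SimpleStringSchema(), kafkaConfig.kafkaProps());

        DataStream<String> lines = env.addSource(consumer);
        lines.print();

        env.execute("kafka-source-sketch");
    }
}
```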
-------------------------------------------------------------------------------- 1 | apollo { 2 | url = "http://singgel:8080" 3 | } -------------------------------------------------------------------------------- /src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <configuration> 3 | <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender"> 4 | <encoder> 5 | <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS}|%level|%logger{1}|%msg%n</pattern> 6 | <charset>UTF-8</charset> 7 | </encoder> 8 | </appender> 9 | <root level="INFO"> 10 | <appender-ref ref="STDOUT"/> 11 | </root> 12 | </configuration> -------------------------------------------------------------------------------- /src/test/java/com/singgel/bigdata/recommend/HbaseByteTest.java: -------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.recommend; 2 | 3 | import junit.framework.TestCase; 4 | import org.apache.hadoop.hbase.util.Bytes; 5 | 6 | /** 7 | * @author singgel 8 | * @created_at: 2019/3/28 10:37 9 | * 10 | */ 11 | public class HbaseByteTest extends TestCase { 12 | 13 | public void testByte() { 14 | String userIdStr = "124235435235.4"; 15 | Long userIdLong = 124235435235L; 16 | 17 | 18 | byte[] strByte = Bytes.toBytes(userIdStr); 19 | byte[] longByte = Bytes.toBytes(userIdLong); 20 | 21 | System.out.println(Bytes.toString(strByte)); 22 | 23 | // written as a long but read back as a String: the output is garbled 24 | System.out.println(Bytes.toString(longByte)); 25 | 26 | // written as a String but read back as a long: the value is wrong 27 | System.out.println(Bytes.toLong(strByte)); 28 | 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/test/java/com/singgel/bigdata/recommend/HbaseGetTest.java: -------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.recommend; 2 | 3 | import com.singgel.bigdata.flinksinkhbase.config.HbaseConfig; 4 | import com.singgel.bigdata.flinksinkhbase.common.HbaseUtil; 5 | import junit.framework.TestCase; 6 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.ObjectMapper; 7 | import org.apache.hadoop.hbase.client.Get; 8 | import org.apache.hadoop.hbase.util.Bytes; 9 | 10 | import java.io.IOException; 11 | import java.util.HashMap; 12 | import java.util.Map; 13 | 14 | /** 15 | * @author singgel 16 | * @created_at: 2019/4/2 17:33 17 | * 18 | */ 19 | public class HbaseGetTest extends TestCase { 20 | 21 | public void testHbaseGet() throws IOException { 22 | 23 | String user = "{\n" + 24 | " \"basic:pagerank\": \"basic:pagerank\",\n" + 25 | " \"basic:country\": \"basic:country\",\n" + 26 | " \"basic:province\": \"basic:province\",\n" + 27 | " \"basic:city\": \"basic:city\",\n" + 28 | " \"basic:mobile\": \"basic:mobile\",\n" + 29 | " \"basic:follower_cluster\": \"basic:follower_cluster\",\n" + 30 | " \"basic:quality_cluster\": \"basic:quality_cluster\",\n" + 31 | " \"basic:symbol_cluster\": \"basic:symbol_cluster\",\n" + 32 | " \"basic:topic_cluster\": \"basic:topic_cluster\",\n" + 33 | " \"basic:stock_click7\": \"basic:stock_click7\",\n" + 34 | " \"basic:stock_show7\": \"basic:stock_show7\",\n" + 35 | " \"basic:stock_click30\": \"basic:stock_click30\",\n" + 36 | " \"basic:stock_show30\": \"basic:stock_show30\",\n" + 37 | " \"basic:symbol_page_enter\": \"basic:symbol_page_enter\",\n" + 38 | " \"basic:symbol_new_status\": \"basic:symbol_new_status\",\n" + 39 | " \"basic:symbol_hot\": \"basic:symbol_hot\",\n" + 40 | " \"basic:symbol_finance\": \"basic:symbol_finance\",\n" + 41 | " \"basic:symbol_news\": \"basic:symbol_news\",\n" + 42 | " \"basic:symbol_notice\": \"basic:symbol_notice\",\n" + 43 | " 
\"basic:symbol_general\": \"basic:symbol_general\",\n" + 44 | " \"basic:symbol_page_view\": \"basic:symbol_page_view\",\n" + 45 | " \"basic:symbol_page_origin\": \"basic:symbol_page_origin\",\n" + 46 | " \"basic:attention_mark\": \"basic:attention_mark\",\n" + 47 | " \"basic:rebalance_num\": \"basic:rebalance_num\",\n" + 48 | " \"basic:topic_personal_short_click\": \"basic:topic_personal_short_click\",\n" + 49 | " \"basic:topic_personal_short_show\": \"basic:topic_personal_short_show\",\n" + 50 | " \"basic:topic_personal_long_click\": \"basic:topic_personal_long_click\",\n" + 51 | " \"basic:topic_personal_long_show\": \"basic:topic_personal_long_show\",\n" + 52 | " \"basic:dislike_1st\": \"basic:dislike_1st\",\n" + 53 | " \"basic:dislike_2st\": \"basic:dislike_2st\",\n" + 54 | " \"basic:dislike_3st\": \"basic:dislike_3st\",\n" + 55 | " \"basic:dislike_4st\": \"basic:dislike_4st\",\n" + 56 | " \"basic:dislike_5st\": \"basic:dislike_5st\",\n" + 57 | " \"basic:familar_1st\": \"basic:familar_1st\",\n" + 58 | " \"basic:familar_2st\": \"basic:familar_2st\",\n" + 59 | " \"basic:familar_3st\": \"basic:familar_3st\",\n" + 60 | " \"basic:familar_4st\": \"basic:familar_4st\",\n" + 61 | " \"basic:familar_5st\": \"basic:familar_5st\",\n" + 62 | " \"basic:like_1st\": \"basic:like_1st\",\n" + 63 | " \"basic:like_2st\": \"basic:like_2st\",\n" + 64 | " \"basic:like_3st\": \"basic:like_3st\",\n" + 65 | " \"basic:like_4st\": \"basic:like_4st\",\n" + 66 | " \"basic:like_5st\": \"basic:like_5st\",\n" + 67 | " \"basic:unfamilar_1st\": \"basic:unfamilar_1st\",\n" + 68 | " \"basic:unfamilar_2st\": \"basic:unfamilar_2st\",\n" + 69 | " \"basic:unfamilar_3st\": \"basic:unfamilar_3st\",\n" + 70 | " \"basic:unfamilar_4st\": \"basic:unfamilar_4st\",\n" + 71 | " \"basic:unfamilar_5st\": \"basic:unfamilar_5st\",\n" + 72 | " \"basic:headline_down_cnt\": \"basic:headline_down_cnt\",\n" + 73 | " \"basic:headline_up_cnt\": \"basic:headline_up_cnt\",\n" + 74 | " \"basic:optional_cnt\": \"basic:optional_cnt\",\n" + 75 | " \"basic:dynamic_cnt\": \"basic:dynamic_cnt\",\n" + 76 | " \"basic:quotation_cnt\": \"basic:quotation_cnt\",\n" + 77 | " \"basic:base_rate\": \"basic:base_rate\",\n" + 78 | " \"basic:mark_gegu_enter\": \"basic:mark_gegu_enter\",\n" + 79 | " \"basic:mark_share_sum\": \"basic:mark_share_sum\",\n" + 80 | " \"basic:mark_head_dislike_sum\": \"basic:mark_head_dislike_sum\",\n" + 81 | " \"basic:mark_status_post_user_sum\": \"basic:mark_status_post_user_sum\",\n" + 82 | " \"basic:mark_search_sum\": \"basic:mark_search_sum\",\n" + 83 | " \"basic:mark_debate_post_user_num\": \"basic:mark_debate_post_user_num\",\n" + 84 | " \"basic:author_click_week\": \"basic:author_click_week\",\n" + 85 | " \"basic:author_show_week\": \"basic:author_show_week\",\n" + 86 | " \"basic:author_click_month\": \"basic:author_click_month\",\n" + 87 | " \"basic:author_show_month\": \"basic:author_show_month\"\n" + 88 | " }"; 89 | 90 | ObjectMapper mapper = new ObjectMapper(); 91 | Map map = mapper.readValue(user, Map.class); 92 | 93 | String zookeeperQuorum = "singgel-53-3.inter.singgel.com,singgel-53-4.inter.singgel.com,singgel-53-5.inter.singgel.com,singgel-53-6.inter.singgel.com,singgel-54-3.inter.singgel.com,singgel-54-4.inter.singgel.com,singgel-54-5.inter.singgel.com,singgel-54-6.inter.singgel.com"; 94 | HbaseConfig hbaseConfig = new HbaseConfig(zookeeperQuorum, "2181", "/hbase-unsecure", 1, 0L, new HashMap<>()); 95 | 96 | HbaseUtil hbaseUtil = new HbaseUtil(hbaseConfig); 97 | String[] uids = {"3148682933", "3188053557", "2912663770", 
"3227054543", "1492133910", "1275730031"}; 98 | String[] statusIds = {"124616652","124650145","124458448","124628342","124386412","124382379","124303730","124479145","124580988","124331284"}; 99 | 100 | // long start1 = System.currentTimeMillis(); 101 | // for (String uid : uids) { 102 | // Get get = new Get(Bytes.toBytes(uid)); 103 | // map.keySet().forEach(e -> { 104 | // get.addFamily(Bytes.toBytes(e.split(":")[0])); 105 | // }); 106 | // hbaseUtil.singleGet("user_feature", get); 107 | // } 108 | // long start2 = System.currentTimeMillis(); 109 | // for (String uid : uids) { 110 | // Get get = new Get(Bytes.toBytes(uid)); 111 | // map.keySet().forEach(e -> { 112 | // get.addColumn(Bytes.toBytes(e.split(":")[0]), Bytes.toBytes(e.split(":")[1])); 113 | // }); 114 | // hbaseUtil.singleGet("user_feature", get); 115 | // } 116 | long start3 = System.currentTimeMillis(); 117 | // for (String uid : uids) { 118 | // hbaseUtil.singleGet("user_feature", new Get(Bytes.toBytes(uid))); 119 | // } 120 | 121 | for (String sid : statusIds) { 122 | hbaseUtil.singleGet("status_feature_string", new Get(Bytes.toBytes(sid))); 123 | } 124 | 125 | long start4 = System.currentTimeMillis(); 126 | 127 | // System.out.println(String.format("get with given families cost: %d ms", start2 - start1)); 128 | // System.out.println(String.format("get with given columns cost: %d ms", start3 - start2)); 129 | System.out.println(String.format("get with no given columns cost: %d ms", start4 - start3)); 130 | 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /src/test/java/com/singgel/bigdata/recommend/JobConfigTest.java: -------------------------------------------------------------------------------- 1 | package com.singgel.bigdata.recommend; 2 | 3 | import com.singgel.bigdata.flinksinkhbase.common.HbaseUtil; 4 | import com.singgel.bigdata.flinksinkhbase.common.JoinTable; 5 | import com.singgel.bigdata.flinksinkhbase.common.ValueFormat; 6 | import com.singgel.bigdata.flinksinkhbase.config.HbaseConfig; 7 | import com.singgel.bigdata.flinksinkhbase.config.JobConfig; 8 | import com.singgel.bigdata.flinksinkhbase.config.KafkaConfig; 9 | import junit.framework.Assert; 10 | import junit.framework.TestCase; 11 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.ObjectMapper; 12 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.JsonNodeFactory; 13 | import org.apache.flink.shaded.jackson2.com.fasterxml.jackson.databind.node.ObjectNode; 14 | import org.junit.Test; 15 | 16 | import java.io.IOException; 17 | import java.util.ArrayList; 18 | import java.util.HashMap; 19 | import java.util.List; 20 | import java.util.Map; 21 | 22 | /** 23 | * \* @author singgel 24 | * \* @created_at: 2019/3/25 上午11:40 25 | * \ 26 | */ 27 | public class JobConfigTest extends TestCase { 28 | 29 | @Test 30 | public void testJobConfig() throws IOException { 31 | 32 | ObjectMapper mapper = new ObjectMapper(); 33 | 34 | String input = 
"3521088992|1553845345473_1420|16|USER_NEW_STATUS|3|124029277|1|33|3|25|2|0|0|0.0|0.0|200.0|0.0|0.0|0.0|-1|0.0||223.104.4.109|移动|中国||0.0|timeline_status_tf_rerank_v1|1049|0.0|5747931945392156544|0.3354492783546448||4487|19264|0|0|0|0|3|#|EWC=3-GOOGL=1-RUSS=1-JEQ=1-ENZL=1-BA=1|n,n,0,0,359.0,334.0||330.61522521972654|47=20-42=19-55=14-24=5-58=5-62=4-28=2-64=2-66=1-57=1-25=1-48=1-61=1#1109218013=3-8806901933=3-9699237903=2-9383578508=2-7807472003=2-9725962662=2-1978838521=2-1558983019=2-9139068670=2-3826938159=2-5124430882=2-1077346177=1-1683324140=1-1383227981=1-3216693756=1-9688940470=1-1697559028=1-1703783922=1-6028781397=1-1250983240=1#=4-SZ002430=2-EWC=1-HUAW=1-EWY=1-SZ300436=1-BA=1#94_3=10-54_3=6-319_3=5-80_3=5-95_3=4-182_3=4-12_3=3-343_3=2-308_3=2-218_3=2-64_3=2-337_3=2-224_3=2-260_3=2-139_3=1-116_3=1-137_3=1-76_3=1-157_3=1-78_3=1#中国=3-飞机=2-成本=2-总理=2-合作=2-市场=2-技>术=2-季报=2-板块=2-状态=1-工业股=1-中美=1-金叉=1-牛市=1-妈妈=1-谈判=1-爸爸=1-新西兰=1-上证指数=1-工作=1|高手|"; 35 | 36 | Map columnsConfig = new HashMap<>(); 37 | columnsConfig.put("0", "basic:time"); 38 | columnsConfig.put("1", "basic:userId"); 39 | columnsConfig.put("2", "basic:sessionId"); 40 | columnsConfig.put("3", "basic:testMissionId"); 41 | columnsConfig.put("4", "basic:strategeName"); 42 | columnsConfig.put("5", "basic:statusType"); 43 | columnsConfig.put("6", "basic:statusId"); 44 | columnsConfig.put("7", "basic:position"); 45 | columnsConfig.put("8", "basic:likeCount"); 46 | columnsConfig.put("9", "basic:retweetCount"); 47 | columnsConfig.put("10", "basic:replyCount"); 48 | columnsConfig.put("20", "basic:tag"); 49 | columnsConfig.put("22", "basic:stockSymbol"); 50 | columnsConfig.put("31", "basic:randomId"); 51 | columnsConfig.put("33", "basic:quoteString"); 52 | columnsConfig.put("41", "basic:contextInfo"); 53 | 54 | 55 | Map userFeatureColumnMapping = new HashMap<>(); 56 | userFeatureColumnMapping.put("basic:pagerank", "basic:pagerank"); 57 | userFeatureColumnMapping.put("basic:country", "basic:country"); 58 | userFeatureColumnMapping.put("basic:province", "basic:province"); 59 | userFeatureColumnMapping.put("basic:city", "basic:city"); 60 | userFeatureColumnMapping.put("basic:mobile", "basic:mobile"); 61 | userFeatureColumnMapping.put("basic:follow_cluster", "basic:follow_cluster"); 62 | userFeatureColumnMapping.put("basic:quality_cluster", "basic:quality_cluster"); 63 | userFeatureColumnMapping.put("basic:symbol_cluster", "basic:symbol_cluster"); 64 | 65 | JoinTable userFeature = new JoinTable("user_feature", "basic:userId", userFeatureColumnMapping); 66 | 67 | Map statusFeatureColumnMapping = new HashMap<>(); 68 | 69 | statusFeatureColumnMapping.put("basic:user_id", "basic:user_id"); 70 | statusFeatureColumnMapping.put("basic:symbol_id", "basic:symbol_id"); 71 | statusFeatureColumnMapping.put("basic:created_at", "basic:created_at"); 72 | statusFeatureColumnMapping.put("basic:source", "basic:source"); 73 | statusFeatureColumnMapping.put("basic:retweet_status_id", "basic:retweet_status_id"); 74 | statusFeatureColumnMapping.put("basic:paid_mention_user_id", "basic:paid_mention_user_id"); 75 | statusFeatureColumnMapping.put("basic:retweet_user_id", "basic:retweet_user_id"); 76 | statusFeatureColumnMapping.put("basic:retweet_symbol_id", "basic:retweet_symbol_id"); 77 | statusFeatureColumnMapping.put("basic:truncated", "basic:truncated"); 78 | statusFeatureColumnMapping.put("basic:flags", "basic:flags"); 79 | statusFeatureColumnMapping.put("basic:expired_at", "basic:expired_at"); 80 | statusFeatureColumnMapping.put("basic:title_length", 
"basic:title_length"); 81 | statusFeatureColumnMapping.put("basic:title_hash", "basic:title_hash"); 82 | 83 | JoinTable statusFeature = new JoinTable("status_feature_string", "basic:statusId", statusFeatureColumnMapping); 84 | 85 | List joinTables = new ArrayList<>(); 86 | joinTables.add(userFeature); 87 | joinTables.add(statusFeature); 88 | 89 | String bootstrtapServers = "localhost:9092"; 90 | String topic = "recommend2.statistics"; 91 | String groupId = "flink_recommend2_statistic_test"; 92 | ValueFormat valueFormat = ValueFormat.CSV; 93 | String delimiter = "|"; 94 | KafkaConfig kafkaConfig = new KafkaConfig(bootstrtapServers, topic, groupId, valueFormat, delimiter, new HashMap<>()); 95 | String zookeeperQuorum = "singgel-53-3.inter.singgel.com,singgel-53-4.inter.singgel.com,singgel-53-5.inter.singgel.com,singgel-53-6.inter.singgel.com,singgel-54-3.inter.singgel.com,singgel-54-4.inter.singgel.com,singgel-54-5.inter.singgel.com,singgel-54-6.inter.singgel.com"; 96 | HbaseConfig hbaseConfig = new HbaseConfig(zookeeperQuorum, "2181", "/hbase-unsecure", 1, 0L, new HashMap<>()); 97 | 98 | String rowKeyDelimiter = "#"; 99 | List rowKeyColumns = new ArrayList<>(); 100 | rowKeyColumns.add("basic:userId"); 101 | rowKeyColumns.add("basic:statusId"); 102 | String tableName = "test"; 103 | String jobName = "recommend_feature_hbase"; 104 | JobConfig jobConfig = new JobConfig(columnsConfig, rowKeyDelimiter, rowKeyColumns, tableName, kafkaConfig, hbaseConfig, joinTables, jobName, 2, ""); 105 | 106 | String jobConfigJosn = jobConfig.toString(); 107 | 108 | String expected = "{\"indexColumnMapping\":{\"22\":\"basic:stockSymbol\",\"33\":\"basic:quoteString\",\"0\":\"basic:time\",\"1\":\"basic:userId\",\"2\":\"basic:sessionId\",\"3\":\"basic:testMissionId\",\"4\":\"basic:strategeName\",\"5\":\"basic:statusType\",\"6\":\"basic:statusId\",\"7\":\"basic:position\",\"8\":\"basic:likeCount\",\"9\":\"basic:retweetCount\",\"41\":\"basic:contextInfo\",\"20\":\"basic:tag\",\"31\":\"basic:randomId\",\"10\":\"basic:replyCount\"},\"rowKeyDelimiter\":\"#\",\"rowKeyColumns\":[\"basic:userId\",\"basic:statusId\"],\"tableName\":\"test\",\"kafkaConfig\":{\"bootstrapServers\":\"localhost:9092\",\"topic\":\"recommend2.statistics\",\"groupId\":\"flink_recommend2_statistic_test\",\"valueFormat\":\"CSV\",\"delimiter\":\"|\",\"optionalProps\":{}},\"hbaseConfig\":{\"zookerperQuorum\":\"singgel-53-3.inter.singgel.com,singgel-53-4.inter.singgel.com,singgel-53-5.inter.singgel.com,singgel-53-6.inter.singgel.com,singgel-54-3.inter.singgel.com,singgel-54-4.inter.singgel.com,singgel-54-5.inter.singgel.com,singgel-54-6.inter.singgel.com\",\"port\":\"2181\",\"zookeeperZondeParent\":\"/hbase-unsecure\",\"optionalProp\":{},\"batchCount\":1,\"interval\":0},\"jobName\":\"recommend_feature_hbase\",\"parallelism\":2,\"jarName\":\"\",\"joinTables\":[{\"tableName\":\"user_feature\",\"joinKey\":\"basic:userId\",\"columnsMapping\":{\"basic:pagerank\":\"basic:pagerank\",\"basic:city\":\"basic:city\",\"basic:symbol_cluster\":\"basic:symbol_cluster\",\"basic:country\":\"basic:country\",\"basic:follow_cluster\":\"basic:follow_cluster\",\"basic:quality_cluster\":\"basic:quality_cluster\",\"basic:mobile\":\"basic:mobile\",\"basic:province\":\"basic:province\"}},{\"tableName\":\"status_feature_string\",\"joinKey\":\"basic:statusId\",\"columnsMapping\":{\"basic:source\":\"basic:source\",\"basic:title_hash\":\"basic:title_hash\",\"basic:symbol_id\":\"basic:symbol_id\",\"basic:retweet_user_id\":\"basic:retweet_user_id\",\"basic:user_id\":\"basic:user_id\",\"
basic:title_length\":\"basic:title_length\",\"basic:retweet_status_id\":\"basic:retweet_status_id\",\"basic:flags\":\"basic:flags\",\"basic:paid_mention_user_id\":\"basic:paid_mention_user_id\",\"basic:created_at\":\"basic:created_at\",\"basic:retweet_symbol_id\":\"basic:retweet_symbol_id\",\"basic:expired_at\":\"basic:expired_at\",\"basic:truncated\":\"basic:truncated\"}}]}"; 109 | 110 | 111 | Assert.assertEquals(expected, jobConfigJosn); 112 | 113 | JobConfig reJobConfig = mapper.readValue(jobConfig.toString(), JobConfig.class); 114 | reJobConfig.validate(); 115 | HbaseUtil hbaseUtil = new HbaseUtil(reJobConfig.getHbaseConfig()); 116 | hbaseUtil.prepareTable(reJobConfig.getTableName(), reJobConfig.families()); 117 | 118 | Assert.assertEquals("#", reJobConfig.getRowKeyDelimiter()); 119 | Assert.assertEquals("[basic]", reJobConfig.families().toString()); 120 | 121 | ObjectNode node = new ObjectNode(JsonNodeFactory.instance); 122 | node.put("key", "2019-03-27 09:23:00"); 123 | node.put("value", input); 124 | 125 | } 126 | 127 | } 128 | -------------------------------------------------------------------------------- /src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{50} - %msg%n 10 | 11 | 12 | 13 | 14 | 15 | 16 | ${LOG_HOME}/TestWeb.log.%d{yyyy-MM-dd}.log 17 | 18 | 30 19 | 20 | 21 | 22 | %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{50} - %msg%n 23 | 24 | 25 | 26 | 10MB 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | --------------------------------------------------------------------------------