├── .babelrc ├── .editorconfig ├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── codes ├── flink │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── io │ │ │ └── github │ │ │ └── dunwu │ │ │ └── bigdata │ │ │ └── flink │ │ │ ├── LineSplitter.java │ │ │ ├── WordCount.java │ │ │ ├── WordCountStreaming.java │ │ │ └── streaming │ │ │ ├── socket │ │ │ └── SocketWindowWordCount.java │ │ │ └── wordcount │ │ │ ├── WordCount.java │ │ │ └── util │ │ │ └── WordCountData.java │ │ └── resources │ │ └── logback.xml ├── hbase │ ├── README.md │ ├── hbase-java-api-1.x │ │ ├── pom.xml │ │ └── src │ │ │ ├── main │ │ │ ├── java │ │ │ │ └── io │ │ │ │ │ └── github │ │ │ │ │ └── dunwu │ │ │ │ │ └── bigdata │ │ │ │ │ └── hbase │ │ │ │ │ ├── HBaseTableDTO.java │ │ │ │ │ └── HbaseUtil.java │ │ │ └── resources │ │ │ │ └── log4j.xml │ │ │ └── test │ │ │ └── java │ │ │ └── io │ │ │ └── github │ │ │ └── dunwu │ │ │ └── bigdata │ │ │ └── hbase │ │ │ └── HbaseUtilTest.java │ ├── hbase-java-api-2.x │ │ ├── pom.xml │ │ └── src │ │ │ ├── main │ │ │ └── java │ │ │ │ └── io │ │ │ │ └── github │ │ │ │ └── dunwu │ │ │ │ └── bigdata │ │ │ │ └── hbase │ │ │ │ └── HBaseUtils.java │ │ │ └── test │ │ │ └── java │ │ │ └── io │ │ │ └── github │ │ │ └── dunwu │ │ │ └── bigdata │ │ │ └── hbase │ │ │ └── HBaseUtilsTest.java │ └── pom.xml ├── kafka │ ├── pom.xml │ └── src │ │ ├── main │ │ ├── java │ │ │ └── io │ │ │ │ └── github │ │ │ │ └── dunwu │ │ │ │ └── bigdata │ │ │ │ └── kafka │ │ │ │ ├── demo │ │ │ │ ├── KafkaConsumerCommitAsyncCallbackDemo.java │ │ │ │ ├── KafkaConsumerCommitAsyncDemo.java │ │ │ │ ├── KafkaConsumerCommitAutoDemo.java │ │ │ │ ├── KafkaConsumerCommitOffsetAsyncDemo.java │ │ │ │ ├── KafkaConsumerCommitSyncAndAsyncDemo.java │ │ │ │ ├── KafkaConsumerCommitSyncDemo.java │ │ │ │ ├── KafkaConsumerManualDemo.java │ │ │ │ ├── KafkaConsumerManualPartitionDemo.java │ │ │ │ ├── KafkaConsumerRebalanceListenerDemo.java │ │ │ │ ├── KafkaConsumerStartFromSpecifiedOffsetDemo.java │ │ │ │ ├── KafkaOnlyOneConsumer.java │ │ │ │ ├── KafkaProducerIdempotencyDemo.java │ │ │ │ ├── KafkaProducerSendAsyncCallbackDemo.java │ │ │ │ ├── KafkaProducerSendAsyncDemo.java │ │ │ │ ├── KafkaProducerSendSyncDemo.java │ │ │ │ ├── KafkaProducerTransactionDemo.java │ │ │ │ ├── KafkaStreamDemo.java │ │ │ │ └── KafkaTransactionDemo.java │ │ │ │ └── springboot │ │ │ │ ├── KafkaConsumer.java │ │ │ │ ├── KafkaProducer.java │ │ │ │ ├── KafkaProducerController.java │ │ │ │ ├── MsgKafkaApplication.java │ │ │ │ └── config │ │ │ │ ├── KafkaConsumerConfig.java │ │ │ │ └── KafkaProducerConfig.java │ │ └── resources │ │ │ ├── application.properties │ │ │ └── logback.xml │ │ └── test │ │ └── java │ │ └── io │ │ └── github │ │ └── dunwu │ │ └── bigdata │ │ └── kafka │ │ └── springboot │ │ └── KafkaProducerTest.java ├── scala │ └── src │ │ └── main │ │ └── scala │ │ ├── BreakDemo.scala │ │ ├── ClassDemo.scala │ │ ├── ClosureDemo.scala │ │ ├── ExceptionDemo.scala │ │ ├── ExceptionDemo2.scala │ │ ├── HelloWorld.scala │ │ ├── IfDemo.scala │ │ ├── MatchDemo.scala │ │ ├── SourceDemo.scala │ │ ├── StdInDemo.scala │ │ ├── TraitDemo.scala │ │ ├── WhileDemo.scala │ │ └── test.txt └── zookeeper │ ├── README.md │ ├── pom.xml │ └── src │ ├── main │ └── java │ │ └── io │ │ └── github │ │ └── dunwu │ │ └── bigdata │ │ └── zk │ │ ├── config │ │ ├── ActiveKeyValueStore.java │ │ ├── ConfigUpdater.java │ │ ├── ConfigWatcher.java │ │ ├── ConnectionWatcher.java │ │ ├── ResilientActiveKeyValueStore.java │ │ ├── ResilientConfigUpdater.java │ │ └── package-info.java │ │ 
├── dlock │ │ ├── Callback.java │ │ ├── DLockTemplate.java │ │ ├── DistributedLock.java │ │ ├── TimeoutHandler.java │ │ ├── ZkDLockTemplate.java │ │ ├── ZkReentrantLockCleanerTask.java │ │ ├── ZookeeperReentrantDistributedLock.java │ │ └── package-info.java │ │ └── id │ │ ├── DistributedId.java │ │ └── ZkDistributedId.java │ └── test │ ├── java │ └── io │ │ └── github │ │ └── dunwu │ │ └── bigdata │ │ └── zk │ │ ├── curator │ │ ├── CuratorTest.java │ │ └── CuratorWatcherTest.java │ │ ├── dlock │ │ └── ZkReentrantLockTemplateTest.java │ │ ├── id │ │ └── ZkDistributedIdTest.java │ │ └── zookeeper │ │ └── ZooKeeperTest.java │ └── resources │ └── logback.xml ├── docs ├── .markdownlint.json ├── .vuepress │ ├── config.js │ ├── config │ │ ├── baiduCode.js │ │ └── htmlModules.js │ ├── enhanceApp.js │ ├── plugins │ │ └── love-me │ │ │ ├── index.js │ │ │ └── love-me.js │ ├── public │ │ ├── favicon.ico │ │ ├── img │ │ │ ├── bg.gif │ │ │ ├── dunwu-logo.png │ │ │ ├── favicon.ico │ │ │ ├── more.png │ │ │ └── other.png │ │ └── markmap │ │ │ └── 01.html │ └── styles │ │ ├── index.styl │ │ └── palette.styl ├── 16.大数据 │ ├── 00.综合 │ │ ├── 01.大数据简介.md │ │ ├── 02.大数据学习.md │ │ └── README.md │ ├── 01.hadoop │ │ ├── 01.hdfs │ │ │ ├── 01.HDFS入门.md │ │ │ ├── 02.HDFS运维.md │ │ │ ├── 03.HDFSJavaApi.md │ │ │ └── README.md │ │ ├── 02.yarn.md │ │ ├── 03.mapreduce.md │ │ └── README.md │ ├── 02.hive │ │ ├── 01.Hive入门.md │ │ ├── 02.Hive表.md │ │ ├── 03.Hive视图和索引.md │ │ ├── 04.Hive查询.md │ │ ├── 05.HiveDDL.md │ │ ├── 06.HiveDML.md │ │ ├── 07.Hive运维.md │ │ └── README.md │ ├── 03.hbase │ │ ├── 01.HBase原理.md │ │ ├── 02.HBase命令.md │ │ ├── 03.HBase运维.md │ │ └── README.md │ ├── 04.zookeeper │ │ ├── 01.ZooKeeper原理.md │ │ ├── 02.ZooKeeper命令.md │ │ ├── 03.ZooKeeper运维.md │ │ ├── 04.ZooKeeperJavaApi.md │ │ ├── 05.ZooKeeperAcl.md │ │ └── README.md │ ├── 11.spark │ │ └── 01.Spark简介.md │ ├── 13.flink │ │ ├── 01.Flink入门.md │ │ ├── 02.Flink简介.md │ │ ├── 03.FlinkETL.md │ │ ├── 04.Flink事件驱动.md │ │ ├── 05.FlinkApi.md │ │ ├── 06.Flink架构.md │ │ ├── 07.Flink运维.md │ │ ├── 08.FlinkTableApi.md │ │ └── README.md │ ├── 99.其他 │ │ ├── 01.flume.md │ │ └── 02.sqoop.md │ └── README.md ├── @pages │ ├── archivesPage.md │ ├── categoriesPage.md │ └── tagsPage.md └── README.md ├── package.json ├── pom.xml ├── prettier.config.js ├── scripts └── deploy.sh └── utils ├── config.yml ├── editFrontmatter.js └── modules ├── fn.js └── readFileList.js /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "compact": false 3 | } 4 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig 用于在 IDE 中检查代码的基本 Code Style 2 | # @see: https://editorconfig.org/ 3 | 4 | # 配置说明: 5 | # 所有文件换行使用 Unix 风格(LF),*.bat 文件使用 Windows 风格(CRLF) 6 | # java / sh 文件缩进 4 个空格,其他所有文件缩进 2 个空格 7 | 8 | root = true 9 | 10 | [*] 11 | end_of_line = lf 12 | indent_size = 2 13 | indent_style = space 14 | max_line_length = 120 15 | charset = utf-8 16 | trim_trailing_whitespace = true 17 | insert_final_newline = true 18 | 19 | [*.{bat, cmd}] 20 | end_of_line = crlf 21 | 22 | [*.{java, gradle, groovy, kt, sh, xml}] 23 | indent_size = 4 24 | 25 | [*.md] 26 | max_line_length = 0 27 | trim_trailing_whitespace = false 28 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto eol=lf 2 | 3 | # 
plan text 4 | *.txt text 5 | *.java text 6 | *.scala text 7 | *.groovy text 8 | *.gradle text 9 | *.xml text 10 | *.xsd text 11 | *.tld text 12 | *.yaml text 13 | *.yml text 14 | *.wsdd text 15 | *.wsdl text 16 | *.jsp text 17 | *.jspf text 18 | *.js text 19 | *.jsx text 20 | *.json text 21 | *.css text 22 | *.less text 23 | *.sql text 24 | *.properties text 25 | *.md text 26 | 27 | # unix style 28 | *.sh text eol=lf 29 | 30 | # win style 31 | *.bat text eol=crlf 32 | 33 | # don't handle 34 | *.der -text 35 | *.jks -text 36 | *.pfx -text 37 | *.map -text 38 | *.patch -text 39 | *.dat -text 40 | *.data -text 41 | *.db -text 42 | 43 | # binary 44 | *.jar binary 45 | *.war binary 46 | *.zip binary 47 | *.tar binary 48 | *.tar.gz binary 49 | *.gz binary 50 | *.apk binary 51 | *.bin binary 52 | *.exe binary 53 | 54 | # images 55 | *.png binary 56 | *.jpg binary 57 | *.ico binary 58 | *.gif binary 59 | 60 | # medias 61 | *.mp3 binary 62 | *.swf binary 63 | 64 | # fonts 65 | *.eot binary 66 | *.svg binary 67 | *.ttf binary 68 | *.woff binary 69 | 70 | # others 71 | *.pdf binary 72 | *.doc binary 73 | *.docx binary 74 | *.ppt binary 75 | *.pptx binary 76 | *.xls binary 77 | *.xlsx binary 78 | *.xmind binary 79 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------------------- 2 | # more gitignore templates see https://github.com/github/gitignore 3 | # --------------------------------------------------------------------- 4 | 5 | # ------------------------------- java ------------------------------- 6 | # compiled folders 7 | classes 8 | target 9 | logs 10 | .mtj.tmp/ 11 | 12 | # compiled files 13 | *.class 14 | 15 | # bluej files 16 | *.ctxt 17 | 18 | # package files # 19 | *.jar 20 | *.war 21 | *.nar 22 | *.ear 23 | *.zip 24 | *.tar.gz 25 | *.rar 26 | 27 | # virtual machine crash logs 28 | hs_err_pid* 29 | 30 | # maven plugin temp files 31 | .flattened-pom.xml 32 | 33 | 34 | # ------------------------------- javascript ------------------------------- 35 | # dependencies 36 | node_modules 37 | 38 | # temp folders 39 | build 40 | dist 41 | _book 42 | _jsdoc 43 | .temp 44 | .deploy*/ 45 | 46 | # temp files 47 | *.log 48 | npm-debug.log* 49 | yarn-debug.log* 50 | yarn-error.log* 51 | bundle*.js 52 | .DS_Store 53 | Thumbs.db 54 | db.json 55 | book.pdf 56 | package-lock.json 57 | 58 | 59 | # ------------------------------- intellij ------------------------------- 60 | .idea 61 | *.iml 62 | 63 | 64 | # ------------------------------- eclipse ------------------------------- 65 | .classpath 66 | .project 67 | -------------------------------------------------------------------------------- /codes/flink/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | 7 | org.springframework.boot 8 | spring-boot-starter-parent 9 | 2.6.3 10 | 11 | 12 | io.github.dunwu.bigdata 13 | flink 14 | 大数据 - Flink 15 | 1.0.0 16 | jar 17 | 18 | 19 | 1.14.3 20 | 21 | 22 | 23 | 24 | org.apache.flink 25 | flink-java 26 | ${flink.version} 27 | 28 | 29 | org.apache.flink 30 | flink-core 31 | ${flink.version} 32 | 33 | 34 | org.apache.flink 35 | flink-clients_2.12 36 | ${flink.version} 37 | 38 | 39 | org.apache.flink 40 | flink-streaming-java_2.12 41 | ${flink.version} 42 | 43 | 44 | org.apache.flink 45 | flink-table-api-java-bridge_2.12 46 | ${flink.version} 47 | 48 | 49 | org.apache.flink 50 | 
flink-table-planner_2.12 51 | ${flink.version} 52 | 53 | 54 | org.apache.flink 55 | flink-test-utils_2.12 56 | ${flink.version} 57 | test 58 | 59 | 60 | 61 | org.springframework.boot 62 | spring-boot-starter 63 | 64 | 65 | org.springframework.boot 66 | spring-boot-starter-test 67 | test 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /codes/flink/src/main/java/io/github/dunwu/bigdata/flink/LineSplitter.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.flink; 2 | 3 | import org.apache.flink.api.common.functions.FlatMapFunction; 4 | import org.apache.flink.api.java.tuple.Tuple2; 5 | import org.apache.flink.util.Collector; 6 | 7 | public class LineSplitter implements FlatMapFunction> { 8 | 9 | @Override 10 | public void flatMap(String value, Collector> out) { 11 | // normalize and split the line into words 12 | String[] tokens = value.toLowerCase().split("\\W+"); 13 | 14 | // emit the pairs 15 | for (String token : tokens) { 16 | if (token.length() > 0) { 17 | out.collect(new Tuple2(token, 1)); 18 | } 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /codes/flink/src/main/java/io/github/dunwu/bigdata/flink/WordCount.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.flink; 2 | 3 | import org.apache.flink.api.java.DataSet; 4 | import org.apache.flink.api.java.ExecutionEnvironment; 5 | import org.apache.flink.api.java.aggregation.Aggregations; 6 | import org.apache.flink.api.java.tuple.Tuple2; 7 | 8 | public class WordCount { 9 | 10 | public static void main(String[] args) throws Exception { 11 | 12 | // 设置运行环境 13 | final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); 14 | 15 | // 配置数据源 16 | // you can also use env.readTextFile(...) 
to get words 17 | DataSet text = env.fromElements("To be, or not to be,--that is the question:--", 18 | "Whether 'tis nobler in the mind to suffer", 19 | "The slings and arrows of outrageous fortune", 20 | "Or to take arms against a sea of troubles,"); 21 | 22 | // 进行一系列转换 23 | DataSet> counts = 24 | // split up the lines in pairs (2-tuples) containing: (word,1) 25 | text.flatMap(new LineSplitter()) 26 | // group by the tuple field "0" and sum up tuple field "1" 27 | .groupBy(0).aggregate(Aggregations.SUM, 1); 28 | 29 | // emit result 30 | counts.print(); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /codes/flink/src/main/java/io/github/dunwu/bigdata/flink/WordCountStreaming.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.flink; 2 | 3 | import org.apache.flink.api.java.tuple.Tuple2; 4 | import org.apache.flink.api.java.typeutils.TupleTypeInfo; 5 | import org.apache.flink.streaming.api.datastream.DataStreamSource; 6 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 7 | import org.apache.flink.util.Collector; 8 | 9 | public class WordCountStreaming { 10 | 11 | public static void main(String[] args) throws Exception { 12 | 13 | // 设置运行环境 14 | StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 15 | 16 | // 配置数据源 17 | DataStreamSource source = env.fromElements("To be, or not to be,--that is the question:--", 18 | "Whether 'tis nobler in the mind to suffer", 19 | "The slings and arrows of outrageous fortune", 20 | "Or to take arms against a sea of troubles"); 21 | 22 | // 进行一系列转换 23 | source 24 | // split up the lines in pairs (2-tuples) containing: (word,1) 25 | .flatMap((String value, Collector> out) -> { 26 | // emit the pairs 27 | for (String token : value.toLowerCase().split("\\W+")) { 28 | if (token.length() > 0) { 29 | out.collect(new Tuple2<>(token, 1)); 30 | } 31 | } 32 | }) 33 | // due to type erasure, we need to specify the return type 34 | .returns(TupleTypeInfo.getBasicTupleTypeInfo(String.class, Integer.class)) 35 | // group by the tuple field "0" 36 | .keyBy(0) 37 | // sum up tuple on field "1" 38 | .sum(1) 39 | // print the result 40 | .print(); 41 | 42 | // 提交执行 43 | env.execute(); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /codes/flink/src/main/java/io/github/dunwu/bigdata/flink/streaming/socket/SocketWindowWordCount.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | 19 | package io.github.dunwu.bigdata.flink.streaming.socket; 20 | 21 | import org.apache.flink.api.common.functions.FlatMapFunction; 22 | import org.apache.flink.api.common.functions.ReduceFunction; 23 | import org.apache.flink.api.java.utils.ParameterTool; 24 | import org.apache.flink.streaming.api.datastream.DataStream; 25 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 26 | import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows; 27 | import org.apache.flink.streaming.api.windowing.time.Time; 28 | import org.apache.flink.util.Collector; 29 | 30 | /** 31 | * Implements a streaming windowed version of the "WordCount" program. 32 | * 33 | *

This program connects to a server socket and reads strings from the socket. The easiest way to 34 | * try this out is to open a text server (at port 12345) using the netcat tool via 35 | * 36 | * 37 | * nc -l 12345 on Linux or nc -l -p 12345 on Windows 38 | * 39 | * 40 | *

and run this example with the hostname and the port as arguments. 41 | */ 42 | @SuppressWarnings("serial") 43 | public class SocketWindowWordCount { 44 | 45 | public static void main(String[] args) throws Exception { 46 | 47 | // the host and the port to connect to 48 | final String hostname; 49 | final int port; 50 | try { 51 | final ParameterTool params = ParameterTool.fromArgs(args); 52 | hostname = params.has("hostname") ? params.get("hostname") : "localhost"; 53 | port = params.has("port") ? params.getInt("port") : 9000; 54 | } catch (Exception e) { 55 | System.err.println( 56 | "No port specified. Please run 'SocketWindowWordCount " 57 | + "--hostname --port ', where hostname (localhost by default) " 58 | + "and port is the address of the text server"); 59 | System.err.println( 60 | "To start a simple text server, run 'netcat -l ' and " 61 | + "type the input text into the command line"); 62 | return; 63 | } 64 | 65 | // get the execution environment 66 | final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 67 | 68 | // get input data by connecting to the socket 69 | DataStream text = env.socketTextStream(hostname, port, "\n"); 70 | 71 | // parse the data, group it, window it, and aggregate the counts 72 | DataStream windowCounts = 73 | text.flatMap( 74 | new FlatMapFunction() { 75 | @Override 76 | public void flatMap( 77 | String value, Collector out) { 78 | for (String word : value.split("\\s")) { 79 | out.collect(new WordWithCount(word, 1L)); 80 | } 81 | } 82 | }) 83 | .keyBy(value -> value.word) 84 | .window(TumblingProcessingTimeWindows.of(Time.seconds(5))) 85 | .reduce( 86 | new ReduceFunction() { 87 | @Override 88 | public WordWithCount reduce(WordWithCount a, WordWithCount b) { 89 | return new WordWithCount(a.word, a.count + b.count); 90 | } 91 | }); 92 | 93 | // print the results with a single thread, rather than in parallel 94 | windowCounts.print().setParallelism(1); 95 | 96 | env.execute("Socket Window WordCount"); 97 | } 98 | 99 | // ------------------------------------------------------------------------ 100 | 101 | /** Data type for words with count. */ 102 | public static class WordWithCount { 103 | 104 | public String word; 105 | public long count; 106 | 107 | public WordWithCount() {} 108 | 109 | public WordWithCount(String word, long count) { 110 | this.word = word; 111 | this.count = count; 112 | } 113 | 114 | @Override 115 | public String toString() { 116 | return word + " : " + count; 117 | } 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /codes/flink/src/main/java/io/github/dunwu/bigdata/flink/streaming/wordcount/util/WordCountData.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package io.github.dunwu.bigdata.flink.streaming.wordcount.util; 20 | 21 | /** 22 | * Provides the default data sets used for the WordCount example program. The default data sets are 23 | * used, if no parameters are given to the program. 24 | */ 25 | public class WordCountData { 26 | 27 | public static final String[] WORDS = 28 | new String[] { 29 | "To be, or not to be,--that is the question:--", 30 | "Whether 'tis nobler in the mind to suffer", 31 | "The slings and arrows of outrageous fortune", 32 | "Or to take arms against a sea of troubles,", 33 | "And by opposing end them?--To die,--to sleep,--", 34 | "No more; and by a sleep to say we end", 35 | "The heartache, and the thousand natural shocks", 36 | "That flesh is heir to,--'tis a consummation", 37 | "Devoutly to be wish'd. To die,--to sleep;--", 38 | "To sleep! perchance to dream:--ay, there's the rub;", 39 | "For in that sleep of death what dreams may come,", 40 | "When we have shuffled off this mortal coil,", 41 | "Must give us pause: there's the respect", 42 | "That makes calamity of so long life;", 43 | "For who would bear the whips and scorns of time,", 44 | "The oppressor's wrong, the proud man's contumely,", 45 | "The pangs of despis'd love, the law's delay,", 46 | "The insolence of office, and the spurns", 47 | "That patient merit of the unworthy takes,", 48 | "When he himself might his quietus make", 49 | "With a bare bodkin? who would these fardels bear,", 50 | "To grunt and sweat under a weary life,", 51 | "But that the dread of something after death,--", 52 | "The undiscover'd country, from whose bourn", 53 | "No traveller returns,--puzzles the will,", 54 | "And makes us rather bear those ills we have", 55 | "Than fly to others that we know not of?", 56 | "Thus conscience does make cowards of us all;", 57 | "And thus the native hue of resolution", 58 | "Is sicklied o'er with the pale cast of thought;", 59 | "And enterprises of great pith and moment,", 60 | "With this regard, their currents turn awry,", 61 | "And lose the name of action.--Soft you now!", 62 | "The fair Ophelia!--Nymph, in thy orisons", 63 | "Be all my sins remember'd." 
64 | }; 65 | } 66 | -------------------------------------------------------------------------------- /codes/hbase/README.md: -------------------------------------------------------------------------------- 1 | # HBase Demo 2 | 3 | 本项目是一个 Java API 访问 HBase 的示例 4 | -------------------------------------------------------------------------------- /codes/hbase/hbase-java-api-1.x/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | io.github.dunwu.bigdata 7 | hbase-java-api-1.x 8 | 大数据 - HBase - API - 1.x 9 | 1.0.0 10 | jar 11 | 12 | 13 | UTF-8 14 | UTF-8 15 | 1.8 16 | ${java.version} 17 | ${java.version} 18 | 19 | 20 | 21 | 22 | org.apache.hbase 23 | hbase-client 24 | 1.3.1 25 | 26 | 27 | log4j 28 | log4j 29 | 1.2.17 30 | 31 | 32 | junit 33 | junit 34 | 4.12 35 | 36 | 37 | io.github.dunwu 38 | dunwu-tool-core 39 | 0.5.6 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /codes/hbase/hbase-java-api-1.x/src/main/java/io/github/dunwu/bigdata/hbase/HBaseTableDTO.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.hbase; 2 | 3 | /** 4 | * @author Zhang Peng 5 | * @since 2019-03-04 6 | */ 7 | public class HBaseTableDTO { 8 | 9 | private String tableName; 10 | 11 | private String row; 12 | 13 | private String colFamily; 14 | 15 | private String col; 16 | 17 | private String val; 18 | 19 | public HBaseTableDTO() {} 20 | 21 | public HBaseTableDTO(String row, String colFamily, String col, String val) { 22 | this.row = row; 23 | this.colFamily = colFamily; 24 | this.col = col; 25 | this.val = val; 26 | } 27 | 28 | public HBaseTableDTO(String tableName, String row, String colFamily, String col, String val) { 29 | this.tableName = tableName; 30 | this.row = row; 31 | this.colFamily = colFamily; 32 | this.col = col; 33 | this.val = val; 34 | } 35 | 36 | public String getTableName() { 37 | return tableName; 38 | } 39 | 40 | public void setTableName(String tableName) { 41 | this.tableName = tableName; 42 | } 43 | 44 | public String getRow() { 45 | return row; 46 | } 47 | 48 | public void setRow(String row) { 49 | this.row = row; 50 | } 51 | 52 | public String getColFamily() { 53 | return colFamily; 54 | } 55 | 56 | public void setColFamily(String colFamily) { 57 | this.colFamily = colFamily; 58 | } 59 | 60 | public String getCol() { 61 | return col; 62 | } 63 | 64 | public void setCol(String col) { 65 | this.col = col; 66 | } 67 | 68 | public String getVal() { 69 | return val; 70 | } 71 | 72 | public void setVal(String val) { 73 | this.val = val; 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /codes/hbase/hbase-java-api-1.x/src/main/resources/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /codes/hbase/hbase-java-api-2.x/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | io.github.dunwu.bigdata 7 | hbase-java-api-2.x 8 | 大数据 - HBase - API - 2.x 9 | 1.0.0 10 | jar 11 | 12 | 13 | UTF-8 14 | UTF-8 15 | 1.8 16 | ${java.version} 17 | ${java.version} 18 | 19 | 20 | 21 | 22 | 
org.apache.hbase 23 | hbase-client 24 | 2.1.4 25 | 26 | 27 | junit 28 | junit 29 | 4.12 30 | test 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /codes/hbase/hbase-java-api-2.x/src/test/java/io/github/dunwu/bigdata/hbase/HBaseUtilsTest.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.hbase; 2 | 3 | import io.github.dunwu.bigdata.hbase.HBaseUtils; 4 | import javafx.util.Pair; 5 | import org.apache.hadoop.hbase.CompareOperator; 6 | import org.apache.hadoop.hbase.client.Result; 7 | import org.apache.hadoop.hbase.client.ResultScanner; 8 | import org.apache.hadoop.hbase.filter.CompareFilter; 9 | import org.apache.hadoop.hbase.filter.FilterList; 10 | import org.apache.hadoop.hbase.filter.SingleColumnValueFilter; 11 | import org.apache.hadoop.hbase.util.Bytes; 12 | import org.junit.Test; 13 | 14 | import java.util.Arrays; 15 | import java.util.List; 16 | 17 | public class HBaseUtilsTest { 18 | 19 | private static final String TABLE_NAME = "class"; 20 | private static final String TEACHER = "teacher"; 21 | private static final String STUDENT = "student"; 22 | 23 | @Test 24 | public void createTable() { 25 | // 新建表 26 | List columnFamilies = Arrays.asList(TEACHER, STUDENT); 27 | boolean table = HBaseUtils.createTable(TABLE_NAME, columnFamilies); 28 | System.out.println("表创建结果:" + table); 29 | } 30 | 31 | @Test 32 | public void insertData() { 33 | List> pairs1 = Arrays.asList(new Pair<>("name", "Tom"), 34 | new Pair<>("age", "22"), 35 | new Pair<>("gender", "1")); 36 | HBaseUtils.putRow(TABLE_NAME, "rowKey1", STUDENT, pairs1); 37 | 38 | List> pairs2 = Arrays.asList(new Pair<>("name", "Jack"), 39 | new Pair<>("age", "33"), 40 | new Pair<>("gender", "2")); 41 | HBaseUtils.putRow(TABLE_NAME, "rowKey2", STUDENT, pairs2); 42 | 43 | List> pairs3 = Arrays.asList(new Pair<>("name", "Mike"), 44 | new Pair<>("age", "44"), 45 | new Pair<>("gender", "1")); 46 | HBaseUtils.putRow(TABLE_NAME, "rowKey3", STUDENT, pairs3); 47 | } 48 | 49 | 50 | @Test 51 | public void getRow() { 52 | Result result = HBaseUtils.getRow(TABLE_NAME, "rowKey1"); 53 | if (result != null) { 54 | System.out.println(Bytes 55 | .toString(result.getValue(Bytes.toBytes(STUDENT), Bytes.toBytes("name")))); 56 | } 57 | 58 | } 59 | 60 | @Test 61 | public void getCell() { 62 | String cell = HBaseUtils.getCell(TABLE_NAME, "rowKey2", STUDENT, "age"); 63 | System.out.println("cell age :" + cell); 64 | 65 | } 66 | 67 | @Test 68 | public void getScanner() { 69 | ResultScanner scanner = HBaseUtils.getScanner(TABLE_NAME); 70 | if (scanner != null) { 71 | scanner.forEach(result -> System.out.println(Bytes.toString(result.getRow()) + "->" + Bytes 72 | .toString(result.getValue(Bytes.toBytes(STUDENT), Bytes.toBytes("name"))))); 73 | scanner.close(); 74 | } 75 | } 76 | 77 | 78 | @Test 79 | public void getScannerWithFilter() { 80 | FilterList filterList = new FilterList(FilterList.Operator.MUST_PASS_ALL); 81 | SingleColumnValueFilter nameFilter = new SingleColumnValueFilter(Bytes.toBytes(STUDENT), 82 | Bytes.toBytes("name"), CompareOperator.EQUAL, Bytes.toBytes("Jack")); 83 | filterList.addFilter(nameFilter); 84 | ResultScanner scanner = HBaseUtils.getScanner(TABLE_NAME, filterList); 85 | if (scanner != null) { 86 | scanner.forEach(result -> System.out.println(Bytes.toString(result.getRow()) + "->" + Bytes 87 | .toString(result.getValue(Bytes.toBytes(STUDENT), Bytes.toBytes("name"))))); 88 | scanner.close(); 89 | } 90 | } 91 | 92 
| @Test 93 | public void deleteColumn() { 94 | boolean b = HBaseUtils.deleteColumn(TABLE_NAME, "rowKey2", STUDENT, "age"); 95 | System.out.println("删除结果: " + b); 96 | } 97 | 98 | @Test 99 | public void deleteRow() { 100 | boolean b = HBaseUtils.deleteRow(TABLE_NAME, "rowKey2"); 101 | System.out.println("删除结果: " + b); 102 | } 103 | 104 | @Test 105 | public void deleteTable() { 106 | boolean b = HBaseUtils.deleteTable(TABLE_NAME); 107 | System.out.println("删除结果: " + b); 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /codes/hbase/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | io.github.dunwu.bigdata 7 | hbase 8 | 大数据 - HBase 9 | 1.0.0 10 | pom 11 | 12 | 13 | UTF-8 14 | UTF-8 15 | 1.8 16 | ${java.version} 17 | ${java.version} 18 | 19 | 20 | 21 | hbase-java-api-1.x 22 | hbase-java-api-2.x 23 | 24 | 25 | -------------------------------------------------------------------------------- /codes/kafka/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | 7 | org.springframework.boot 8 | spring-boot-starter-parent 9 | 2.6.3 10 | 11 | 12 | io.github.dunwu.bigdata 13 | kafka 14 | 大数据 - Kafka 15 | 1.0.0 16 | jar 17 | 18 | 19 | 20 | org.springframework.boot 21 | spring-boot-starter-web 22 | 23 | 24 | org.springframework.boot 25 | spring-boot-starter-test 26 | test 27 | 28 | 29 | 30 | org.springframework.kafka 31 | spring-kafka 32 | 33 | 34 | org.apache.kafka 35 | kafka-streams 36 | 37 | 38 | org.springframework.kafka 39 | spring-kafka-test 40 | test 41 | 42 | 43 | 44 | 45 | 46 | 47 | org.springframework.boot 48 | spring-boot-maven-plugin 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/demo/KafkaConsumerCommitAsyncCallbackDemo.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.demo; 2 | 3 | import org.apache.kafka.clients.consumer.*; 4 | import org.apache.kafka.clients.producer.ProducerConfig; 5 | import org.apache.kafka.common.TopicPartition; 6 | import org.slf4j.Logger; 7 | import org.slf4j.LoggerFactory; 8 | 9 | import java.time.Duration; 10 | import java.util.Collections; 11 | import java.util.Map; 12 | import java.util.Properties; 13 | 14 | /** 15 | * Kafka 消费者异步提交2 16 | *

17 | * 在成功提交或碰到无法恢复的错误之前,commitSync() 会一直重试,但是 commitAsync() 不会。 18 | * 19 | * 它之所以不进行重试,是因为在它收到服务器响应的时候,可能有一个更大的偏移量已经提交成功。 20 | *
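 * 补充说明:如果确实需要在异步提交失败后重试,常见做法是在回调中维护一个单调递增的序号,仅当没有更新的提交发生时才重试,避免用较小的偏移量覆盖较大的偏移量(此处仅为说明,非本示例实现)。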

21 | * 消费者配置参考:https://kafka.apache.org/documentation/#consumerconfigs 22 | * @author Zhang Peng 23 | * @since 2020-06-20 24 | */ 25 | public class KafkaConsumerCommitAsyncCallbackDemo { 26 | 27 | private static final Logger log = LoggerFactory.getLogger(KafkaConsumerCommitAsyncCallbackDemo.class); 28 | private static KafkaConsumer consumer; 29 | 30 | static { 31 | // 指定消费者的配置 32 | final Properties properties = new Properties(); 33 | properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); 34 | properties.put(ConsumerConfig.GROUP_ID_CONFIG, "test"); 35 | properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); 36 | properties.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "1000"); 37 | properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, 38 | "org.apache.kafka.common.serialization.StringDeserializer"); 39 | properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, 40 | "org.apache.kafka.common.serialization.StringDeserializer"); 41 | 42 | // 使用配置初始化 Kafka 消费者 43 | consumer = new KafkaConsumer<>(properties); 44 | } 45 | 46 | public static void main(String[] args) { 47 | 48 | // 订阅 Topic 49 | consumer.subscribe(Collections.singletonList("test")); 50 | 51 | // 轮询 52 | while (true) { 53 | ConsumerRecords records = consumer.poll(Duration.ofMillis(100)); 54 | for (ConsumerRecord record : records) { 55 | System.out.printf("topic = %s, partition = %s, offset = %d, key = %s, value = %s\n ", record.topic(), 56 | record.partition(), record.offset(), record.key(), record.value()); 57 | } 58 | consumer.commitAsync(new OffsetCommitCallback() { 59 | @Override 60 | public void onComplete(Map offsets, Exception e) { 61 | if (e != null) { 62 | log.error("recv failed.", e); 63 | } else { 64 | offsets.forEach((k, v) -> { 65 | System.out.printf("recv success, topic = %s, partition = %s, offset = %d\n ", k.topic(), 66 | k.partition(), v.offset()); 67 | }); 68 | } 69 | } 70 | }); 71 | } 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/demo/KafkaConsumerCommitAsyncDemo.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.demo; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerConfig; 4 | import org.apache.kafka.clients.consumer.ConsumerRecord; 5 | import org.apache.kafka.clients.consumer.ConsumerRecords; 6 | import org.apache.kafka.clients.consumer.KafkaConsumer; 7 | import org.apache.kafka.clients.producer.ProducerConfig; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | 11 | import java.time.Duration; 12 | import java.util.Collections; 13 | import java.util.Properties; 14 | 15 | /** 16 | * Kafka 消费者异步提交 17 | *

18 | * 在成功提交或碰到无法恢复的错误之前,commitSync() 会一直重试,但是 commitAsync() 不会。 19 | * 20 | * 它之所以不进行重试,是因为在它收到服务器响应的时候,可能有一个更大的偏移量已经提交成功。 21 | *

22 | * 消费者配置参考:https://kafka.apache.org/documentation/#consumerconfigs 23 | * @author Zhang Peng 24 | * @since 2020-06-20 25 | */ 26 | public class KafkaConsumerCommitAsyncDemo { 27 | 28 | private static final Logger log = LoggerFactory.getLogger(KafkaConsumerCommitAsyncDemo.class); 29 | private static KafkaConsumer consumer; 30 | 31 | static { 32 | // 指定消费者的配置 33 | final Properties properties = new Properties(); 34 | properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); 35 | properties.put(ConsumerConfig.GROUP_ID_CONFIG, "test"); 36 | properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); 37 | properties.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "1000"); 38 | properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, 39 | "org.apache.kafka.common.serialization.StringDeserializer"); 40 | properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, 41 | "org.apache.kafka.common.serialization.StringDeserializer"); 42 | 43 | // 使用配置初始化 Kafka 消费者 44 | consumer = new KafkaConsumer<>(properties); 45 | } 46 | 47 | public static void main(String[] args) { 48 | 49 | // 订阅 Topic 50 | consumer.subscribe(Collections.singletonList("test")); 51 | 52 | // 轮询 53 | while (true) { 54 | ConsumerRecords records = consumer.poll(Duration.ofMillis(100)); 55 | for (ConsumerRecord record : records) { 56 | System.out.printf("topic = %s, partition = %s, offset = %d, key = %s, value = %s\n ", record.topic(), 57 | record.partition(), record.offset(), record.key(), record.value()); 58 | } 59 | 60 | // 异步提交 61 | consumer.commitAsync(); 62 | } 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/demo/KafkaConsumerCommitAutoDemo.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.demo; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerConfig; 4 | import org.apache.kafka.clients.consumer.ConsumerRecord; 5 | import org.apache.kafka.clients.consumer.ConsumerRecords; 6 | import org.apache.kafka.clients.consumer.KafkaConsumer; 7 | import org.slf4j.Logger; 8 | import org.slf4j.LoggerFactory; 9 | 10 | import java.time.Duration; 11 | import java.util.Collections; 12 | import java.util.Properties; 13 | 14 | /** 15 | * Kafka 消费者自动提交 16 | *

17 | * 当 enable.auto.commit 属性被设为 true,那么每过 5s,消费者会自动把从 poll() 方法接收到的最大偏移量提交上去。 18 | * 19 | * 提交时间间隔由 auto.commit.interval.ms 控制,默认值是 5s。 20 | * 21 | * 自动提交虽然方便,不过无法避免重复消息问题:如果在两次自动提交之间发生再均衡,接管分区的消费者会从上一次提交的偏移量开始处理,这期间的消息就会被重复消费。 22 | *

23 | * 消费者配置参考:https://kafka.apache.org/documentation/#consumerconfigs 24 | * @author Zhang Peng 25 | * @since 2020-06-20 26 | */ 27 | public class KafkaConsumerCommitAutoDemo { 28 | 29 | private static final Logger log = LoggerFactory.getLogger(KafkaConsumerCommitAutoDemo.class); 30 | private static KafkaConsumer consumer; 31 | 32 | static { 33 | // 指定消费者的配置 34 | final Properties properties = new Properties(); 35 | properties.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); 36 | properties.put(ConsumerConfig.GROUP_ID_CONFIG, "test"); 37 | properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "true"); 38 | properties.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "1000"); 39 | properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, 40 | "org.apache.kafka.common.serialization.StringDeserializer"); 41 | properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, 42 | "org.apache.kafka.common.serialization.StringDeserializer"); 43 | 44 | // 使用配置初始化 Kafka 消费者 45 | consumer = new KafkaConsumer<>(properties); 46 | } 47 | 48 | public static void main(String[] args) { 49 | 50 | // 订阅 Topic 51 | consumer.subscribe(Collections.singletonList("test")); 52 | 53 | try { 54 | // 轮询 55 | while (true) { 56 | // 消费消息 57 | ConsumerRecords records = consumer.poll(Duration.ofMillis(100)); 58 | for (ConsumerRecord record : records) { 59 | log.info("topic = {}, partition = {}, offset = {}, key = {}, value = {}", record.topic(), 60 | record.partition(), record.offset(), record.key(), record.value()); 61 | } 62 | } 63 | } finally { 64 | // 关闭消费者 65 | consumer.close(); 66 | } 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/demo/KafkaConsumerCommitOffsetAsyncDemo.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.demo; 2 | 3 | import org.apache.kafka.clients.consumer.*; 4 | import org.apache.kafka.common.TopicPartition; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | import java.util.Collections; 9 | import java.util.HashMap; 10 | import java.util.Map; 11 | import java.util.Properties; 12 | 13 | /** 14 | * Kafka 消费者提交特定的偏移量 15 | *

16 | * 本示例在处理消息的过程中维护一个偏移量 map,并按一定条件(这里是每处理 1000 条记录)调用 commitAsync(offsets, callback) 提交特定的偏移量。 17 | * 18 | * 与同步提交不同,异步提交不会阻塞轮询;提交结果通过回调处理,失败时不会自动重试。 19 | *

20 | * 消费者配置参考:https://kafka.apache.org/documentation/#consumerconfigs 21 | * @author Zhang Peng 22 | * @since 2020-06-20 23 | */ 24 | public class KafkaConsumerCommitOffsetAsyncDemo { 25 | 26 | private static final Logger log = LoggerFactory.getLogger(KafkaConsumerCommitOffsetAsyncDemo.class); 27 | private static KafkaConsumer consumer; 28 | private static int count = 0; 29 | // 用于跟踪偏移量的 map 30 | private static final Map offsets = new HashMap<>(); 31 | 32 | static { 33 | // 指定消费者的配置 34 | final Properties properties = new Properties(); 35 | properties.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); 36 | properties.put(ConsumerConfig.GROUP_ID_CONFIG, "test"); 37 | properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); 38 | properties.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "1000"); 39 | properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, 40 | "org.apache.kafka.common.serialization.StringDeserializer"); 41 | properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, 42 | "org.apache.kafka.common.serialization.StringDeserializer"); 43 | 44 | // 使用配置初始化 Kafka 消费者 45 | consumer = new KafkaConsumer<>(properties); 46 | } 47 | 48 | public static void main(String[] args) { 49 | 50 | // 订阅 Topic 51 | consumer.subscribe(Collections.singletonList("test")); 52 | 53 | // 轮询 54 | while (true) { 55 | ConsumerRecords records = consumer.poll(100); 56 | for (ConsumerRecord record : records) { 57 | // 模拟业务处理 58 | System.out.printf("topic = %s, partition = %s, offset = %d, key = %s, value = %s\n ", record.topic(), 59 | record.partition(), record.offset(), record.key(), record.value()); 60 | 61 | // 读取并处理记录后,更新 map 的偏移量 62 | offsets.put(new TopicPartition(record.topic(), record.partition()), 63 | new OffsetAndMetadata(record.offset() + 1, "no metadata")); 64 | 65 | // 这里的策略是每处理 1000 条记录就提交一次偏移量 66 | // 实际业务中,可以根据时间或记录内容,合理设置提交偏移量的触发条件 67 | if (count % 1000 == 0) { 68 | 69 | // 异步提交并设置回调,一旦提交请求得到响应(无论成败),就会进入回调处理 70 | consumer.commitAsync(offsets, new OffsetCommitCallback() { 71 | @Override 72 | public void onComplete(Map offsets, Exception e) { 73 | if (e != null) { 74 | log.error("recv failed.", e); 75 | } else { 76 | offsets.forEach((k, v) -> { 77 | System.out.printf("recv success, topic = %s, partition = %s, offset = %d\n ", 78 | k.topic(), k.partition(), v.offset()); 79 | }); 80 | } 81 | } 82 | }); 83 | } 84 | count++; 85 | } 86 | } 87 | } 88 | 89 | } 90 | -------------------------------------------------------------------------------- /codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/demo/KafkaConsumerCommitSyncAndAsyncDemo.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.demo; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerConfig; 4 | import org.apache.kafka.clients.consumer.ConsumerRecord; 5 | import org.apache.kafka.clients.consumer.ConsumerRecords; 6 | import org.apache.kafka.clients.consumer.KafkaConsumer; 7 | import org.apache.kafka.clients.producer.ProducerConfig; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | 11 | import java.time.Duration; 12 | import java.util.Collections; 13 | import java.util.Properties; 14 | 15 | /** 16 | * Kafka 消费者同步异步提交 17 | *

18 | * 针对偶尔出现的提交失败,不进行重试不会有太大问题,因为如果提交失败是因为临时问题导致的,那么后续的提交总会有成功的。 19 | * 20 | * 但如果这是发生在关闭消费者或再均衡前的最后一次提交,就要确保能够提交成功。 21 | * 22 | * 因此,在消费者关闭前一般会组合使用 commitSync() 和 commitAsync()。 23 | *

24 | * 消费者配置参考:https://kafka.apache.org/documentation/#consumerconfigs 25 | * @author Zhang Peng 26 | * @since 2020-06-20 27 | */ 28 | public class KafkaConsumerCommitSyncAndAsyncDemo { 29 | 30 | private static final Logger log = LoggerFactory.getLogger(KafkaConsumerCommitSyncAndAsyncDemo.class); 31 | private static KafkaConsumer consumer; 32 | 33 | static { 34 | // 指定消费者的配置 35 | final Properties properties = new Properties(); 36 | properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); 37 | properties.put(ConsumerConfig.GROUP_ID_CONFIG, "test"); 38 | properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); 39 | properties.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "1000"); 40 | properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, 41 | "org.apache.kafka.common.serialization.StringDeserializer"); 42 | properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, 43 | "org.apache.kafka.common.serialization.StringDeserializer"); 44 | 45 | // 使用配置初始化 Kafka 消费者 46 | consumer = new KafkaConsumer<>(properties); 47 | } 48 | 49 | public static void main(String[] args) { 50 | 51 | // 订阅 Topic 52 | consumer.subscribe(Collections.singletonList("test")); 53 | 54 | try { 55 | // 轮询 56 | while (true) { 57 | ConsumerRecords records = consumer.poll(Duration.ofMillis(100)); 58 | for (ConsumerRecord record : records) { 59 | System.out.printf("topic = %s, partition = %s, offset = %d, key = %s, value = %s\n ", 60 | record.topic(), record.partition(), record.offset(), record.key(), 61 | record.value()); 62 | } 63 | // 如果一切正常,使用 commitAsync() 来提交,这样吞吐量高。并且即使这次提交失败,下一次提交很可能会成功。 64 | consumer.commitAsync(); 65 | } 66 | } catch (Exception e) { 67 | log.error("Unexpected error", e); 68 | } finally { 69 | try { 70 | // 如果直接关闭消费者,就没有下一次提交了。这种情况,使用 commitSync() 方法一直重试,直到提交成功或发生无法恢复的错误 71 | consumer.commitSync(); 72 | } finally { 73 | consumer.close(); 74 | } 75 | } 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/demo/KafkaConsumerCommitSyncDemo.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.demo; 2 | 3 | import org.apache.kafka.clients.consumer.*; 4 | import org.slf4j.Logger; 5 | import org.slf4j.LoggerFactory; 6 | 7 | import java.time.Duration; 8 | import java.util.Collections; 9 | import java.util.Properties; 10 | 11 | /** 12 | * Kafka 消费者同步提交 13 | *

14 | * 使用 commitSync() 提交偏移量最简单也最可靠。这个 API 会提交由 poll() 方法返回的最新偏移量,提交成功后马上返回,如果提交失败就抛出异常。 15 | * 16 | * 同步提交方式会一直阻塞,直到接收到 Broker 的响应请求,这会大大限制吞吐量。 17 | *

18 | * 消费者配置参考:https://kafka.apache.org/documentation/#consumerconfigs 19 | * @author Zhang Peng 20 | * @since 2020-06-20 21 | */ 22 | public class KafkaConsumerCommitSyncDemo { 23 | 24 | private static final Logger log = LoggerFactory.getLogger(KafkaConsumerCommitSyncDemo.class); 25 | private static KafkaConsumer consumer; 26 | 27 | static { 28 | // 指定消费者的配置 29 | final Properties properties = new Properties(); 30 | properties.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); 31 | properties.put(ConsumerConfig.GROUP_ID_CONFIG, "test"); 32 | properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); 33 | properties.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "1000"); 34 | properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, 35 | "org.apache.kafka.common.serialization.StringDeserializer"); 36 | properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, 37 | "org.apache.kafka.common.serialization.StringDeserializer"); 38 | 39 | // 使用配置初始化 Kafka 消费者 40 | consumer = new KafkaConsumer<>(properties); 41 | } 42 | 43 | public static void main(String[] args) { 44 | 45 | // 订阅 Topic 46 | consumer.subscribe(Collections.singletonList("test")); 47 | 48 | // 轮询 49 | while (true) { 50 | ConsumerRecords records = consumer.poll(Duration.ofMillis(100)); 51 | for (ConsumerRecord record : records) { 52 | System.out.printf("topic = %s, partition = %s, offset = %d, key = %s, value = %s\n ", record.topic(), 53 | record.partition(), record.offset(), record.key(), record.value()); 54 | } 55 | try { 56 | // 同步提交 57 | consumer.commitSync(); 58 | } catch (CommitFailedException e) { 59 | log.error("commit failed", e); 60 | } 61 | } 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/demo/KafkaConsumerManualDemo.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.demo; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerConfig; 4 | import org.apache.kafka.clients.consumer.ConsumerRecord; 5 | import org.apache.kafka.clients.consumer.ConsumerRecords; 6 | import org.apache.kafka.clients.consumer.KafkaConsumer; 7 | import org.slf4j.Logger; 8 | import org.slf4j.LoggerFactory; 9 | 10 | import java.time.Duration; 11 | import java.util.ArrayList; 12 | import java.util.Arrays; 13 | import java.util.List; 14 | import java.util.Properties; 15 | 16 | /** 17 | * @author Zhang Peng 18 | * @since 2018/7/12 19 | */ 20 | public class KafkaConsumerManualDemo { 21 | 22 | private static final String HOST = "localhost:9092"; 23 | private final Logger log = LoggerFactory.getLogger(this.getClass()); 24 | 25 | public static void main(String[] args) { 26 | // 创建消费者 27 | Properties props = new Properties(); 28 | props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, HOST); 29 | props.put(ConsumerConfig.GROUP_ID_CONFIG, "test"); 30 | props.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); 31 | props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, 32 | "org.apache.kafka.common.serialization.StringDeserializer"); 33 | props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, 34 | "org.apache.kafka.common.serialization.StringDeserializer"); 35 | KafkaConsumer consumer = new KafkaConsumer<>(props); 36 | 37 | // 订阅主题 38 | consumer.subscribe(Arrays.asList("t1", "t2")); 39 | final int minBatchSize = 200; 40 | List> buffer = new ArrayList<>(); 41 | 42 | // 轮询 43 | try { 44 | while (true) { 45 | ConsumerRecords records = 
consumer.poll(Duration.ofMillis(100)); 46 | for (ConsumerRecord record : records) { 47 | buffer.add(record); 48 | } 49 | if (buffer.size() >= minBatchSize) { 50 | // 逻辑处理,例如保存到数据库 51 | consumer.commitSync(); 52 | buffer.clear(); 53 | } 54 | } 55 | } finally { 56 | // 退出前,关闭消费者 57 | consumer.close(); 58 | } 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/demo/KafkaConsumerManualPartitionDemo.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.demo; 2 | 3 | import org.apache.kafka.clients.consumer.*; 4 | import org.apache.kafka.common.TopicPartition; 5 | 6 | import java.util.Arrays; 7 | import java.util.Collections; 8 | import java.util.List; 9 | import java.util.Properties; 10 | 11 | /** 12 | * @author Zhang Peng 13 | * @since 2018/7/12 14 | */ 15 | public class KafkaConsumerManualPartitionDemo { 16 | 17 | private static final String HOST = "localhost:9092"; 18 | 19 | public static void main(String[] args) { 20 | Properties props = new Properties(); 21 | props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, HOST); 22 | props.put(ConsumerConfig.GROUP_ID_CONFIG, "test2"); 23 | props.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); 24 | props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, 25 | "org.apache.kafka.common.serialization.StringDeserializer"); 26 | props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, 27 | "org.apache.kafka.common.serialization.StringDeserializer"); 28 | 29 | KafkaConsumer consumer = new KafkaConsumer<>(props); 30 | consumer.subscribe(Arrays.asList("t1")); 31 | 32 | try { 33 | while (true) { 34 | ConsumerRecords records = consumer.poll(Long.MAX_VALUE); 35 | for (TopicPartition partition : records.partitions()) { 36 | List> partitionRecords = records.records(partition); 37 | for (ConsumerRecord record : partitionRecords) { 38 | System.out.println(partition.partition() + ": " + record.offset() + ": " + record.value()); 39 | } 40 | long lastOffset = partitionRecords.get(partitionRecords.size() - 1).offset(); 41 | consumer.commitSync(Collections.singletonMap(partition, new OffsetAndMetadata(lastOffset + 1))); 42 | } 43 | } 44 | } finally { 45 | consumer.close(); 46 | } 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/demo/KafkaConsumerRebalanceListenerDemo.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.demo; 2 | 3 | import org.apache.kafka.clients.consumer.*; 4 | import org.apache.kafka.common.TopicPartition; 5 | import org.apache.kafka.common.errors.WakeupException; 6 | import org.slf4j.Logger; 7 | import org.slf4j.LoggerFactory; 8 | 9 | import java.util.*; 10 | 11 | /** 12 | * Kafka 再均衡监听器示例 13 | * @author Zhang Peng 14 | * @since 2020-06-20 15 | */ 16 | public class KafkaConsumerRebalanceListenerDemo { 17 | 18 | private static final Logger log = LoggerFactory.getLogger(KafkaConsumerRebalanceListenerDemo.class); 19 | private static KafkaConsumer consumer; 20 | // 用于跟踪偏移量的 map 21 | private static final Map offsets = new HashMap<>(); 22 | 23 | static { 24 | final Properties props = new Properties(); 25 | props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); 26 | props.put(ConsumerConfig.GROUP_ID_CONFIG, "test"); 27 | 
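// 补充说明:本示例随后通过 commitAsync(offsets, ...) 与 commitSync(offsets) 手动提交偏移量,通常应将 enable.auto.commit 设为 false,避免自动提交与手动提交混用(此处保留原配置,仅作提示)。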
props.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "true"); 28 | props.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "1000"); 29 | props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, 30 | "org.apache.kafka.common.serialization.StringDeserializer"); 31 | props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, 32 | "org.apache.kafka.common.serialization.StringDeserializer"); 33 | 34 | consumer = new KafkaConsumer<>(props); 35 | } 36 | 37 | public static void main(String[] args) { 38 | try { 39 | // 订阅主题,使用再均衡监听器 40 | consumer.subscribe(Collections.singletonList("test"), new HandleRebalance()); 41 | 42 | while (true) { 43 | ConsumerRecords records = consumer.poll(100); 44 | for (ConsumerRecord record : records) { 45 | System.out.printf("topic = %s, partition = %s, offset = % d, key = %s, value = %s\n ", 46 | record.topic(), record.partition(), record.offset(), record.key(), 47 | record.value()); 48 | offsets.put(new TopicPartition(record.topic(), record.partition()), 49 | new OffsetAndMetadata(record.offset() + 1, "no metadata")); 50 | } 51 | consumer.commitAsync(offsets, null); 52 | } 53 | } catch (WakeupException e) { 54 | // 忽略异常,正在关闭消费者 55 | } catch (Exception e) { 56 | log.error("Unexpected error", e); 57 | } finally { 58 | try { 59 | consumer.commitSync(offsets); 60 | } finally { 61 | consumer.close(); 62 | System.out.println("Closed consumer and we are done"); 63 | } 64 | } 65 | } 66 | 67 | // 实现 ConsumerRebalanceListener 接口 68 | private static class HandleRebalance implements ConsumerRebalanceListener { 69 | 70 | // 获得新分区后开始消费消息,不要做其他事 71 | @Override 72 | public void onPartitionsAssigned(Collection partitions) {} 73 | 74 | // 如果发生再均衡,我们要在即将逝去分区所有权时提交偏移量。 75 | // 注意:提交的是最近处理过的偏移量,而不是批次中还在处理的最后一个偏移量。因为分区有可能在我们还在处理消息的时候被撤回。 76 | // 提交的偏移量是已处理的,所以不会有问题。 77 | @Override 78 | public void onPartitionsRevoked(Collection partitions) { 79 | System.out.println("Lost partitions in rebalance.Committing current offsets:" + offsets); 80 | consumer.commitSync(offsets); 81 | } 82 | 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/demo/KafkaConsumerStartFromSpecifiedOffsetDemo.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.demo; 2 | 3 | import org.apache.kafka.clients.consumer.*; 4 | import org.apache.kafka.common.TopicPartition; 5 | import org.apache.kafka.common.errors.WakeupException; 6 | import org.slf4j.Logger; 7 | import org.slf4j.LoggerFactory; 8 | 9 | import java.time.Duration; 10 | import java.util.*; 11 | 12 | /** 13 | * Kafka 消费者从特定偏移量处开始处理示例 14 | *

15 | * 本示例通过 ConsumerRebalanceListener 与 seek() 方法,从外部存储(此处用 map 模拟数据库)中读取已保存的偏移量,并从该位置开始消费。 16 | * 17 | * 处理完记录后,先将偏移量保存到外部存储,再调用 commitSync(offsets) 提交。 18 | *

19 | * 消费者配置参考:https://kafka.apache.org/documentation/#consumerconfigs 20 | * @author Zhang Peng 21 | * @since 2020-06-20 22 | */ 23 | public class KafkaConsumerStartFromSpecifiedOffsetDemo { 24 | 25 | private static final Logger log = LoggerFactory.getLogger(KafkaConsumerStartFromSpecifiedOffsetDemo.class); 26 | private static KafkaConsumer consumer; 27 | // 用于跟踪偏移量的 map 28 | private static final Map offsets = new HashMap<>(); 29 | 30 | static { 31 | final Properties props = new Properties(); 32 | props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); 33 | props.put(ConsumerConfig.GROUP_ID_CONFIG, "test"); 34 | props.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "true"); 35 | props.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "1000"); 36 | props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, 37 | "org.apache.kafka.common.serialization.StringDeserializer"); 38 | props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, 39 | "org.apache.kafka.common.serialization.StringDeserializer"); 40 | 41 | consumer = new KafkaConsumer<>(props); 42 | } 43 | 44 | public static void main(String[] args) { 45 | try { 46 | // 订阅主题,使用再均衡监听器 47 | consumer.subscribe(Collections.singletonList("test"), new SaveOffsetsOnRebalance()); 48 | consumer.poll(Duration.ofMillis(0)); 49 | 50 | for (TopicPartition partition : consumer.assignment()) { 51 | consumer.seek(partition, mockGetOffsetsFromDB(partition)); 52 | } 53 | 54 | while (true) { 55 | ConsumerRecords records = consumer.poll(Duration.ofMillis(100)); 56 | for (ConsumerRecord record : records) { 57 | System.out.printf("topic = %s, partition = %s, offset = % d, key = %s, value = %s\n ", 58 | record.topic(), record.partition(), record.offset(), record.key(), 59 | record.value()); 60 | offsets.put(new TopicPartition(record.topic(), record.partition()), 61 | new OffsetAndMetadata(record.offset() + 1, "no metadata")); 62 | } 63 | mockSaveOffsetsToDB(offsets); 64 | consumer.commitSync(offsets); 65 | } 66 | } catch (WakeupException e) { 67 | // 忽略异常,正在关闭消费者 68 | } catch (Exception e) { 69 | log.error("Unexpected error", e); 70 | } finally { 71 | try { 72 | consumer.commitSync(offsets); 73 | } finally { 74 | consumer.close(); 75 | System.out.println("Closed consumer and we are done"); 76 | } 77 | } 78 | } 79 | 80 | /** 81 | * 实现 ConsumerRebalanceListener 接口 82 | */ 83 | private static class SaveOffsetsOnRebalance implements ConsumerRebalanceListener { 84 | 85 | // 获得新分区后开始消费消息,不要做其他事 86 | @Override 87 | public void onPartitionsAssigned(Collection partitions) { 88 | mockSaveOffsetsToDB(offsets); 89 | } 90 | 91 | @Override 92 | public void onPartitionsRevoked(Collection partitions) { 93 | for (TopicPartition partition : partitions) { 94 | System.out.println("查询偏移量"); 95 | consumer.seek(partition, mockGetOffsetsFromDB(partition)); 96 | } 97 | } 98 | 99 | } 100 | 101 | /** 102 | * 模拟提交数据库事务。一般是在处理完记录后,将记录和偏移量插入数据库,然后在即将逝去分区所有权之前提交事务,确保成功保存。 103 | */ 104 | public static void mockSaveOffsetsToDB(Map offsets) { 105 | System.out.println("模拟提交数据库事务"); 106 | offsets.forEach((k, v) -> { 107 | System.out.printf("\tpartition:%s, offset: %d\n", k.partition(), v.offset()); 108 | }); 109 | System.out.println(); 110 | } 111 | 112 | /** 113 | * 模拟从数据库中,根据分区查询已处理的偏移量 114 | */ 115 | public static OffsetAndMetadata mockGetOffsetsFromDB(TopicPartition partition) { 116 | return offsets.get(partition); 117 | } 118 | 119 | } 120 | -------------------------------------------------------------------------------- 
/codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/demo/KafkaOnlyOneConsumer.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.demo; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerConfig; 4 | import org.apache.kafka.clients.consumer.ConsumerRecord; 5 | import org.apache.kafka.clients.consumer.ConsumerRecords; 6 | import org.apache.kafka.clients.consumer.KafkaConsumer; 7 | import org.apache.kafka.common.PartitionInfo; 8 | import org.apache.kafka.common.TopicPartition; 9 | 10 | import java.util.ArrayList; 11 | import java.util.Collection; 12 | import java.util.List; 13 | import java.util.Properties; 14 | 15 | /** 16 | * Kafka 独立消费者示例 17 | * @author Zhang Peng 18 | * @since 2020-06-20 19 | */ 20 | public class KafkaOnlyOneConsumer { 21 | 22 | private static KafkaConsumer consumer; 23 | 24 | static { 25 | // 指定消费者的配置 26 | final Properties properties = new Properties(); 27 | properties.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); 28 | properties.put(ConsumerConfig.GROUP_ID_CONFIG, "test"); 29 | properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "true"); 30 | properties.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "1000"); 31 | properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, 32 | "org.apache.kafka.common.serialization.StringDeserializer"); 33 | properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, 34 | "org.apache.kafka.common.serialization.StringDeserializer"); 35 | 36 | // 使用配置初始化 Kafka 消费者 37 | consumer = new KafkaConsumer<>(properties); 38 | } 39 | 40 | public static void main(String[] args) { 41 | Collection partitions = new ArrayList<>(); 42 | List partitionInfos = consumer.partitionsFor("test"); 43 | if (partitionInfos != null) { 44 | for (PartitionInfo partition : partitionInfos) { 45 | partitions.add(new TopicPartition(partition.topic(), partition.partition())); 46 | } 47 | consumer.assign(partitions); 48 | 49 | while (true) { 50 | ConsumerRecords records = consumer.poll(1000); 51 | 52 | for (ConsumerRecord record : records) { 53 | System.out.printf("topic = %s, partition = %s, offset = %d, key = %s, value = %s\n ", 54 | record.topic(), record.partition(), record.offset(), record.key(), 55 | record.value()); 56 | } 57 | consumer.commitSync(); 58 | } 59 | } 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/demo/KafkaProducerIdempotencyDemo.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.demo; 2 | 3 | import org.apache.kafka.clients.producer.*; 4 | 5 | import java.util.Properties; 6 | 7 | /** 8 | * Kafka 生产者幂等性 9 | *

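The demo below enables idempotence using raw property names. As a hedged sketch (not repo code; the broker address and values are placeholders), these are the producer settings that idempotence generally relies on:

    Properties props = new Properties();
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
    props.put(ProducerConfig.ENABLE_IDEMPOTENCE_CONFIG, true);
    props.put(ProducerConfig.ACKS_CONFIG, "all");                           // idempotence requires acks=all
    props.put(ProducerConfig.RETRIES_CONFIG, Integer.MAX_VALUE);            // retries must be greater than 0
    props.put(ProducerConfig.MAX_IN_FLIGHT_REQUESTS_PER_CONNECTION, 5);     // must not exceed 5
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");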
10 | * 生产者配置参考:https://kafka.apache.org/documentation/#producerconfigs 11 | * @author Zhang Peng 12 | * @since 2020-06-20 13 | */ 14 | public class KafkaProducerIdempotencyDemo { 15 | 16 | private static Producer producer; 17 | 18 | static { 19 | // 指定生产者的配置 20 | final Properties properties = new Properties(); 21 | properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); 22 | // 设置 key 的序列化器 23 | properties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer"); 24 | // 设置 value 的序列化器 25 | properties.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer"); 26 | 27 | // 开启幂等性 28 | properties.put("enable.idempotence", true); 29 | // 设置重试次数 30 | properties.put("retries", 3); 31 | //Reduce the no of requests less than 0 32 | properties.put("linger.ms", 1); 33 | // buffer.memory 控制生产者可用于缓冲的内存总量 34 | properties.put("buffer.memory", 33554432); 35 | 36 | // 使用配置初始化 Kafka 生产者 37 | producer = new KafkaProducer<>(properties); 38 | } 39 | 40 | public static void main(String[] args) { 41 | try { 42 | // 使用 send 方法发送异步消息 43 | for (int i = 0; i < 100; i++) { 44 | String msg = "Message " + i; 45 | RecordMetadata metadata = producer.send(new ProducerRecord<>("test", msg)).get(); 46 | System.out.println("Sent:" + msg); 47 | System.out.printf("Sent success, topic = %s, partition = %s, offset = %d, timestamp = %s\n ", 48 | metadata.topic(), metadata.partition(), metadata.offset(), metadata.timestamp()); 49 | } 50 | producer.flush(); 51 | } catch (Exception e) { 52 | e.printStackTrace(); 53 | } finally { 54 | // 关闭生产者 55 | producer.close(); 56 | } 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/demo/KafkaProducerSendAsyncCallbackDemo.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.demo; 2 | 3 | import org.apache.kafka.clients.producer.*; 4 | 5 | import java.util.Properties; 6 | 7 | /** 8 | * Kafka 异步回调发送 9 | *

10 | * send() is called with a Callback object; once the broker responds (whether the write succeeded or failed), onCompletion() is invoked, so the sending thread does not block waiting for the result. 11 | *

12 | * This reports the outcome of every message without waiting synchronously, giving a good balance between reliability and throughput. 13 | *

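As a small illustration of the callback style described above (a sketch, not repo code; it assumes a producer configured as in the class below), the Callback can also be supplied as a lambda. Callbacks run on the producer's I/O thread, so they should do as little work as possible:

    producer.send(new ProducerRecord<>("test", "hello"), (metadata, exception) -> {
        if (exception != null) {
            exception.printStackTrace();   // the send failed after any retries
        } else {
            System.out.printf("ack: partition=%d, offset=%d%n", metadata.partition(), metadata.offset());
        }
    });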
14 | * 生产者配置参考:https://kafka.apache.org/documentation/#producerconfigs 15 | * @author Zhang Peng 16 | * @since 2020-06-20 17 | */ 18 | public class KafkaProducerSendAsyncCallbackDemo { 19 | 20 | private static Producer producer; 21 | 22 | static { 23 | // 指定生产者的配置 24 | final Properties properties = new Properties(); 25 | properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); 26 | properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, 27 | "org.apache.kafka.common.serialization.StringSerializer"); 28 | properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, 29 | "org.apache.kafka.common.serialization.StringSerializer"); 30 | properties.put(ProducerConfig.ACKS_CONFIG, "all"); 31 | properties.put(ProducerConfig.RETRIES_CONFIG, 2); 32 | properties.put(ProducerConfig.LINGER_MS_CONFIG, 1); 33 | properties.put(ProducerConfig.BATCH_SIZE_CONFIG, 2000); 34 | 35 | // 使用配置初始化 Kafka 生产者 36 | producer = new KafkaProducer<>(properties); 37 | } 38 | 39 | 40 | private static class DemoProducerCallback implements Callback { 41 | 42 | @Override 43 | public void onCompletion(RecordMetadata metadata, Exception e) { 44 | if (e != null) { 45 | e.printStackTrace(); 46 | } else { 47 | System.out.printf("Sent success, topic = %s, partition = %s, offset = %d, timestamp = %s\n ", 48 | metadata.topic(), metadata.partition(), metadata.offset(), metadata.timestamp()); 49 | } 50 | } 51 | 52 | } 53 | 54 | public static void main(String[] args) { 55 | try { 56 | // 使用 send 方法发送异步消息并设置回调,一旦发送请求得到响应(无论成败),就会进入回调处理 57 | for (int i = 0; i < 10000; i++) { 58 | String msg = "Message " + i; 59 | producer.send(new ProducerRecord<>("test", msg), new DemoProducerCallback()); 60 | } 61 | } catch (Exception e) { 62 | e.printStackTrace(); 63 | } finally { 64 | // 关闭生产者 65 | producer.close(); 66 | } 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/demo/KafkaProducerSendAsyncDemo.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.demo; 2 | 3 | import org.apache.kafka.clients.producer.KafkaProducer; 4 | import org.apache.kafka.clients.producer.Producer; 5 | import org.apache.kafka.clients.producer.ProducerConfig; 6 | import org.apache.kafka.clients.producer.ProducerRecord; 7 | 8 | import java.util.Properties; 9 | 10 | /** 11 | * Kafka 异步发送 12 | *

13 | * Sends messages in fire-and-forget fashion, without checking whether they actually reach the broker. 14 | *

15 | * This gives the highest throughput, but a small fraction of messages may be lost. 16 | *

17 | * 生产者配置参考:https://kafka.apache.org/documentation/#producerconfigs 18 | * @author Zhang Peng 19 | * @since 2020-06-20 20 | */ 21 | public class KafkaProducerSendAsyncDemo { 22 | 23 | private static Producer producer; 24 | 25 | static { 26 | // 指定生产者的配置 27 | final Properties properties = new Properties(); 28 | properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); 29 | properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, 30 | "org.apache.kafka.common.serialization.StringSerializer"); 31 | properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, 32 | "org.apache.kafka.common.serialization.StringSerializer"); 33 | 34 | // 使用配置初始化 Kafka 生产者 35 | producer = new KafkaProducer<>(properties); 36 | } 37 | 38 | public static void main(String[] args) { 39 | try { 40 | // 使用 send 方法发送异步消息 41 | for (int i = 0; i < 100; i++) { 42 | String msg = "Message " + i; 43 | producer.send(new ProducerRecord<>("test", msg)); 44 | System.out.println("Sent:" + msg); 45 | } 46 | } catch (Exception e) { 47 | e.printStackTrace(); 48 | } finally { 49 | // 关闭生产者 50 | producer.close(); 51 | } 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/demo/KafkaProducerSendSyncDemo.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.demo; 2 | 3 | import org.apache.kafka.clients.producer.*; 4 | 5 | import java.util.Properties; 6 | 7 | /** 8 | * Kafka 同步发送 9 | *

10 | * send() returns a Future; calling get() blocks until the broker returns a result. 11 | *

12 | * This is a reliable way to deliver messages, but it has the lowest throughput. 13 | *

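A hedged variation on the blocking get() described above (not repo code; the java.util.concurrent imports are assumed): bounding the wait with a timeout so a slow broker cannot block the sender indefinitely:

    try {
        RecordMetadata metadata = producer.send(new ProducerRecord<>("test", "hello"))
                                          .get(10, TimeUnit.SECONDS);   // wait at most 10 seconds for the ack
        System.out.printf("ack: partition=%d, offset=%d%n", metadata.partition(), metadata.offset());
    } catch (TimeoutException | InterruptedException | ExecutionException e) {
        e.printStackTrace();   // decide whether to retry or give up
    }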
14 | * 生产者配置参考:https://kafka.apache.org/documentation/#producerconfigs 15 | * @author Zhang Peng 16 | * @since 2020-06-20 17 | */ 18 | public class KafkaProducerSendSyncDemo { 19 | 20 | private static Producer producer; 21 | 22 | static { 23 | // 指定生产者的配置 24 | final Properties properties = new Properties(); 25 | properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); 26 | properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, 27 | "org.apache.kafka.common.serialization.StringSerializer"); 28 | properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, 29 | "org.apache.kafka.common.serialization.StringSerializer"); 30 | 31 | // 使用配置初始化 Kafka 生产者 32 | producer = new KafkaProducer<>(properties); 33 | } 34 | 35 | public static void main(String[] args) { 36 | try { 37 | // 使用 send 方法发送异步消息 38 | for (int i = 0; i < 100; i++) { 39 | String msg = "Message " + i; 40 | RecordMetadata metadata = producer.send(new ProducerRecord<>("test", msg)).get(); 41 | System.out.println("Sent:" + msg); 42 | System.out.printf("Sent success, topic = %s, partition = %s, offset = %d, timestamp = %s\n ", 43 | metadata.topic(), metadata.partition(), metadata.offset(), metadata.timestamp()); 44 | } 45 | } catch (Exception e) { 46 | e.printStackTrace(); 47 | } finally { 48 | // 关闭生产者 49 | producer.close(); 50 | } 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/demo/KafkaProducerTransactionDemo.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.demo; 2 | 3 | import org.apache.kafka.clients.producer.*; 4 | 5 | import java.util.Properties; 6 | 7 | /** 8 | * Kafka 异步回调发送 9 | *

10 | * initTransactions(), beginTransaction() and commitTransaction() group multiple sends into a single transaction: either all of the messages are written, or none of them are. 11 | *

12 | * Using transactions requires configuring transactional.id on the producer (which implicitly enables idempotence); otherwise initTransactions() throws an exception. 13 | *

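A hedged sketch (not this repo's code; broker address, topic, transactional.id and imports such as org.apache.kafka.common.KafkaException are assumed) of the configuration and control flow a transactional producer generally needs. Note that the transaction is aborted only on failure, not unconditionally:

    Properties props = new Properties();
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
    props.put(ProducerConfig.TRANSACTIONAL_ID_CONFIG, "demo-tx-1");   // required before initTransactions()
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
    Producer<String, String> txProducer = new KafkaProducer<>(props);
    txProducer.initTransactions();
    try {
        txProducer.beginTransaction();
        txProducer.send(new ProducerRecord<>("test", "transaction-data-1"));
        txProducer.send(new ProducerRecord<>("test", "transaction-data-2"));
        txProducer.commitTransaction();     // commit only if everything succeeded
    } catch (KafkaException e) {
        txProducer.abortTransaction();      // roll back on failure; fatal errors (e.g. ProducerFencedException) should close the producer instead
    } finally {
        txProducer.close();
    }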
14 | * 生产者配置参考:https://kafka.apache.org/documentation/#producerconfigs 15 | * @author Zhang Peng 16 | * @since 2020-06-20 17 | */ 18 | public class KafkaProducerTransactionDemo { 19 | 20 | private static Producer producer; 21 | 22 | static { 23 | // 指定生产者的配置 24 | final Properties properties = new Properties(); 25 | properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); 26 | properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, 27 | "org.apache.kafka.common.serialization.StringSerializer"); 28 | properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, 29 | "org.apache.kafka.common.serialization.StringSerializer"); 30 | properties.put(ProducerConfig.ACKS_CONFIG, "all"); 31 | properties.put(ProducerConfig.RETRIES_CONFIG, 2); 32 | properties.put(ProducerConfig.LINGER_MS_CONFIG, 1); 33 | properties.put(ProducerConfig.BATCH_SIZE_CONFIG, 2000); 34 | 35 | // 使用配置初始化 Kafka 生产者 36 | producer = new KafkaProducer<>(properties); 37 | } 38 | 39 | 40 | private static class DemoProducerCallback implements Callback { 41 | 42 | @Override 43 | public void onCompletion(RecordMetadata metadata, Exception e) { 44 | if (e != null) { 45 | e.printStackTrace(); 46 | } else { 47 | System.out.printf("Sent success, topic = %s, partition = %s, offset = %d, timestamp = %s\n ", 48 | metadata.topic(), metadata.partition(), metadata.offset(), metadata.timestamp()); 49 | } 50 | } 51 | 52 | } 53 | 54 | public static void main(String[] args) { 55 | 56 | // 1.初始化事务 57 | producer.initTransactions(); 58 | // 2.开启事务 59 | producer.beginTransaction(); 60 | 61 | try { 62 | // 3.kafka写操作集合 63 | // 3.1 do业务逻辑 64 | 65 | // 3.2 发送消息 66 | producer.send(new ProducerRecord("test", "transaction-data-1")); 67 | producer.send(new ProducerRecord("test", "transaction-data-2")); 68 | 69 | // 3.3 do其他业务逻辑,还可以发送其他topic的消息。 70 | 71 | // 4.事务提交 72 | producer.commitTransaction(); 73 | 74 | } catch (Exception e) { 75 | e.printStackTrace(); 76 | } finally { 77 | // 5.放弃事务 78 | producer.abortTransaction(); 79 | 80 | // 关闭生产者 81 | producer.close(); 82 | } 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/demo/KafkaStreamDemo.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.demo; 2 | 3 | import org.apache.kafka.common.serialization.Serdes; 4 | import org.apache.kafka.common.utils.Bytes; 5 | import org.apache.kafka.streams.KafkaStreams; 6 | import org.apache.kafka.streams.StreamsBuilder; 7 | import org.apache.kafka.streams.StreamsConfig; 8 | import org.apache.kafka.streams.kstream.KStream; 9 | import org.apache.kafka.streams.kstream.KTable; 10 | import org.apache.kafka.streams.kstream.Materialized; 11 | import org.apache.kafka.streams.kstream.Produced; 12 | import org.apache.kafka.streams.state.KeyValueStore; 13 | 14 | import java.util.Arrays; 15 | import java.util.Properties; 16 | 17 | /** 18 | * Kafka 流示例 19 | *

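One small, hedged addition to the word-count topology below (assuming the KafkaStreams instance is the local variable streams, as in the demo): registering a shutdown hook so the topology is closed cleanly when the JVM stops:

    Runtime.getRuntime().addShutdownHook(new Thread(streams::close));   // close the streams topology on JVM shutdown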
20 | * 消费者配置参考:https://kafka.apache.org/documentation/#streamsconfigs 21 | */ 22 | public class KafkaStreamDemo { 23 | 24 | private static Properties properties; 25 | 26 | static { 27 | properties = new Properties(); 28 | properties.put(StreamsConfig.APPLICATION_ID_CONFIG, "wordcount-application"); 29 | properties.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); 30 | properties.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass()); 31 | properties.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass()); 32 | } 33 | 34 | public static void main(String[] args) { 35 | // 设置流构造器 36 | StreamsBuilder builder = new StreamsBuilder(); 37 | KStream textLines = builder.stream("TextLinesTopic"); 38 | KTable wordCounts = 39 | textLines.flatMapValues(textLine -> Arrays.asList(textLine.toLowerCase().split("\\W+"))) 40 | .groupBy((key, word) -> word) 41 | .count(Materialized.>as("counts-store")); 42 | wordCounts.toStream().to("WordsWithCountsTopic", Produced.with(Serdes.String(), Serdes.Long())); 43 | 44 | // 根据流构造器和流配置初始化 Kafka 流 45 | KafkaStreams streams = new KafkaStreams(builder.build(), properties); 46 | streams.start(); 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/springboot/KafkaConsumer.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.springboot; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | import org.springframework.kafka.annotation.KafkaListener; 6 | import org.springframework.stereotype.Component; 7 | 8 | /** 9 | * Kafka 消费者 10 | * @author Zhang Peng 11 | * @since 2018-11-28 12 | */ 13 | @Component 14 | public class KafkaConsumer { 15 | 16 | private final Logger log = LoggerFactory.getLogger(KafkaConsumer.class); 17 | 18 | @KafkaListener(topics = "test") 19 | public void processMessage(String data) { 20 | log.info("收到kafka消息:{}", data); 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/springboot/KafkaProducer.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.springboot; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | import org.springframework.beans.factory.annotation.Autowired; 6 | import org.springframework.kafka.core.KafkaTemplate; 7 | import org.springframework.stereotype.Component; 8 | import org.springframework.transaction.annotation.Transactional; 9 | 10 | /** 11 | * Kafka生产者 12 | * @author Zhang Peng 13 | * @since 2018-11-29 14 | */ 15 | @Component 16 | public class KafkaProducer { 17 | 18 | private final Logger log = LoggerFactory.getLogger(KafkaProducer.class); 19 | 20 | @Autowired 21 | private KafkaTemplate template; 22 | 23 | @Transactional(rollbackFor = RuntimeException.class) 24 | public void sendTransactionMsg(String topic, String data) { 25 | log.info("向kafka发送数据:[{}]", data); 26 | template.executeInTransaction(t -> { 27 | t.send(topic, "prepare"); 28 | if ("error".equals(data)) { 29 | throw new RuntimeException("failed"); 30 | } 31 | t.send(topic, "finish"); 32 | return true; 33 | }); 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/springboot/KafkaProducerController.java: 
-------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.springboot; 2 | 3 | import org.springframework.beans.factory.annotation.Autowired; 4 | import org.springframework.web.bind.annotation.RequestMapping; 5 | import org.springframework.web.bind.annotation.RestController; 6 | 7 | /** 8 | * spring-boot kafka 示例 9 | *

10 | * This controller acts as the producer entry point: it receives messages from a REST endpoint and writes them to the specified Kafka topic. 11 | *

12 | * 访问方式:http://localhost:8080/kafka/send?topic=xxx&data=xxx 13 | * @author Zhang Peng 14 | */ 15 | @RestController 16 | @RequestMapping("kafka") 17 | public class KafkaProducerController { 18 | 19 | @Autowired 20 | private KafkaProducer kafkaProducer; 21 | 22 | @RequestMapping("sendTx") 23 | public void send(String topic, String data) { 24 | kafkaProducer.sendTransactionMsg(topic, data); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/springboot/MsgKafkaApplication.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.springboot; 2 | 3 | import org.springframework.boot.SpringApplication; 4 | import org.springframework.boot.autoconfigure.SpringBootApplication; 5 | 6 | @SpringBootApplication 7 | public class MsgKafkaApplication { 8 | 9 | public static void main(String[] args) { 10 | SpringApplication.run(MsgKafkaApplication.class, args); 11 | } 12 | 13 | } 14 | -------------------------------------------------------------------------------- /codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/springboot/config/KafkaConsumerConfig.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.springboot.config; 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerConfig; 4 | import org.apache.kafka.common.serialization.StringDeserializer; 5 | import org.springframework.beans.factory.annotation.Value; 6 | import org.springframework.context.annotation.Bean; 7 | import org.springframework.context.annotation.Configuration; 8 | import org.springframework.kafka.annotation.EnableKafka; 9 | import org.springframework.kafka.config.ConcurrentKafkaListenerContainerFactory; 10 | import org.springframework.kafka.config.KafkaListenerContainerFactory; 11 | import org.springframework.kafka.core.ConsumerFactory; 12 | import org.springframework.kafka.core.DefaultKafkaConsumerFactory; 13 | import org.springframework.kafka.listener.ConcurrentMessageListenerContainer; 14 | 15 | import java.util.HashMap; 16 | import java.util.Map; 17 | 18 | @Configuration 19 | @EnableKafka 20 | public class KafkaConsumerConfig { 21 | 22 | @Value("${spring.kafka.bootstrap-servers}") 23 | private String bootstrapServers; 24 | 25 | @Bean(name = "kafkaListenerContainerFactory") 26 | public KafkaListenerContainerFactory> kafkaListenerContainerFactory() { 27 | ConcurrentKafkaListenerContainerFactory factory = 28 | new ConcurrentKafkaListenerContainerFactory<>(); 29 | factory.setConsumerFactory(consumerFactory("groupA")); 30 | factory.setConcurrency(3); 31 | factory.getContainerProperties().setPollTimeout(3000); 32 | return factory; 33 | } 34 | 35 | public ConsumerFactory consumerFactory(String consumerGroupId) { 36 | return new DefaultKafkaConsumerFactory<>(consumerConfig(consumerGroupId)); 37 | } 38 | 39 | public Map consumerConfig(String consumerGroupId) { 40 | Map props = new HashMap<>(); 41 | props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); 42 | props.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false); 43 | props.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "100"); 44 | props.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, "15000"); 45 | props.put(ConsumerConfig.GROUP_ID_CONFIG, consumerGroupId); 46 | props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); 47 | props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, 
StringDeserializer.class); 48 | props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class); 49 | return props; 50 | } 51 | 52 | @Bean(name = "kafkaListenerContainerFactory1") 53 | public KafkaListenerContainerFactory> kafkaListenerContainerFactory1() { 54 | ConcurrentKafkaListenerContainerFactory factory = 55 | new ConcurrentKafkaListenerContainerFactory<>(); 56 | factory.setConsumerFactory(consumerFactory("groupB")); 57 | factory.setConcurrency(3); 58 | factory.getContainerProperties().setPollTimeout(3000); 59 | return factory; 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /codes/kafka/src/main/java/io/github/dunwu/bigdata/kafka/springboot/config/KafkaProducerConfig.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.springboot.config; 2 | 3 | import org.apache.kafka.clients.producer.ProducerConfig; 4 | import org.apache.kafka.common.serialization.StringSerializer; 5 | import org.springframework.beans.factory.annotation.Value; 6 | import org.springframework.context.annotation.Bean; 7 | import org.springframework.context.annotation.Configuration; 8 | import org.springframework.kafka.annotation.EnableKafka; 9 | import org.springframework.kafka.core.DefaultKafkaProducerFactory; 10 | import org.springframework.kafka.core.KafkaTemplate; 11 | import org.springframework.kafka.core.ProducerFactory; 12 | import org.springframework.kafka.transaction.KafkaTransactionManager; 13 | 14 | import java.util.HashMap; 15 | import java.util.Map; 16 | 17 | @Configuration 18 | @EnableKafka 19 | public class KafkaProducerConfig { 20 | 21 | @Value("${spring.kafka.bootstrap-servers}") 22 | private String bootstrapServers; 23 | 24 | @Value("${spring.kafka.producer.retries}") 25 | private Integer retries; 26 | 27 | @Value("${spring.kafka.producer.batch-size}") 28 | private Integer batchSize; 29 | 30 | @Bean 31 | public KafkaTransactionManager transactionManager() { 32 | KafkaTransactionManager manager = new KafkaTransactionManager(producerFactory()); 33 | return manager; 34 | } 35 | 36 | @Bean 37 | public ProducerFactory producerFactory() { 38 | DefaultKafkaProducerFactory producerFactory = 39 | new DefaultKafkaProducerFactory<>(producerConfigs()); 40 | producerFactory.transactionCapable(); 41 | producerFactory.setTransactionIdPrefix("hous-"); 42 | return producerFactory; 43 | } 44 | 45 | @Bean 46 | public Map producerConfigs() { 47 | Map props = new HashMap<>(7); 48 | props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); 49 | props.put(ProducerConfig.RETRIES_CONFIG, retries); 50 | props.put(ProducerConfig.BATCH_SIZE_CONFIG, batchSize); 51 | props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class); 52 | props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class); 53 | return props; 54 | } 55 | 56 | @Bean 57 | public KafkaTemplate kafkaTemplate() { 58 | return new KafkaTemplate<>(producerFactory()); 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /codes/kafka/src/main/resources/application.properties: -------------------------------------------------------------------------------- 1 | server.port = 18080 2 | spring.kafka.bootstrap-servers = localhost:9092 3 | #spring.kafka.bootstrap-servers = tdh60dev01:9092,tdh60dev02:9092,tdh60dev03:9092 4 | spring.kafka.producer.retries = 3 5 | spring.kafka.producer.transaction-id-prefix = bigdata-kafka 6 | # 
producer 7 | spring.kafka.producer.batch-size = 1000 8 | spring.kafka.producer.key-serializer = org.apache.kafka.common.serialization.StringSerializer 9 | spring.kafka.producer.value-serializer = org.apache.kafka.common.serialization.StringSerializer 10 | # consumer 11 | spring.kafka.consumer.group-id = bigdata 12 | spring.kafka.consumer.enable-auto-commit = true 13 | spring.kafka.consumer.auto-commit-interval = 1000 14 | spring.kafka.consumer.key-deserializer = org.apache.kafka.common.serialization.StringDeserializer 15 | spring.kafka.consumer.value-deserializer = org.apache.kafka.common.serialization.StringDeserializer 16 | -------------------------------------------------------------------------------- /codes/kafka/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | %d{HH:mm:ss.SSS} [%boldYellow(%thread)] [%highlight(%-5level)] %boldGreen(%c{36}.%M) - %boldBlue(%m%n) 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /codes/kafka/src/test/java/io/github/dunwu/bigdata/kafka/springboot/KafkaProducerTest.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.kafka.springboot; 2 | 3 | import org.junit.jupiter.api.Test; 4 | import org.springframework.beans.factory.annotation.Autowired; 5 | import org.springframework.boot.test.context.SpringBootTest; 6 | 7 | /** 8 | * @author Zhang Peng 9 | * @since 2018-11-29 10 | */ 11 | @SpringBootTest(classes = MsgKafkaApplication.class) 12 | public class KafkaProducerTest { 13 | 14 | @Autowired 15 | private KafkaProducer kafkaProducer; 16 | 17 | @Test 18 | public void test() { 19 | kafkaProducer.sendTransactionMsg("test", "上联:天王盖地虎"); 20 | kafkaProducer.sendTransactionMsg("test", "下联:宝塔镇河妖"); 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /codes/scala/src/main/scala/BreakDemo.scala: -------------------------------------------------------------------------------- 1 | import scala.util.control._ 2 | 3 | /** 4 | * 循环 Break 控制示例 5 | * 6 | * @author peng.zhang 7 | */ 8 | object BreakDemo { 9 | def main(args: Array[String]) { 10 | var a = 0; 11 | var b = 0; 12 | val numList1 = List(1, 2, 3, 4, 5); 13 | val numList2 = List(11, 12, 13); 14 | 15 | val outer = new Breaks; 16 | val inner = new Breaks; 17 | 18 | outer.breakable { 19 | for (a <- numList1) { 20 | println("Value of a: " + a); 21 | inner.breakable { 22 | for (b <- numList2) { 23 | println("Value of b: " + b); 24 | if (b == 12) { 25 | inner.break; 26 | } 27 | } 28 | } // 内嵌循环中断 29 | } 30 | } // 外部循环中断 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /codes/scala/src/main/scala/ClassDemo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * 类和对象示例 3 | * 4 | * @author peng.zhang 5 | */ 6 | class Point(val xc: Int, val yc: Int) { 7 | var x: Int = xc 8 | var y: Int = yc 9 | 10 | def move(dx: Int, dy: Int) { 11 | x = x + dx 12 | y = y + dy 13 | println("x 的坐标点 : " + x); 14 | println("y 的坐标点 : " + y); 15 | } 16 | } 17 | 18 | class Location(override val xc: Int, override val yc: Int, val zc: Int) 19 | extends Point(xc, yc) { 20 | var z: Int = zc 21 | 22 | def move(dx: Int, dy: Int, dz: Int) { 23 | x = x + dx 24 | y = y + dy 25 | z = z + dz 26 | println("x 的坐标点 : " + x); 27 | println("y 的坐标点 : " + y); 28 | println("z 的坐标点 : " + z); 
29 | } 30 | } 31 | 32 | object Test { 33 | def main(args: Array[String]) { 34 | val loc = new Location(10, 20, 15); 35 | 36 | // 移到一个新的位置 37 | loc.move(10, 10, 5); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /codes/scala/src/main/scala/ClosureDemo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * 闭包示例 3 | * 4 | * @author peng.zhang 5 | */ 6 | object ClosureDemo { 7 | def main(args: Array[String]) { 8 | println("muliplier(1) value = " + multiplier(1)) 9 | println("muliplier(2) value = " + multiplier(2)) 10 | } 11 | 12 | var factor = 3 13 | val multiplier = (i: Int) => i * factor 14 | } 15 | -------------------------------------------------------------------------------- /codes/scala/src/main/scala/ExceptionDemo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * 捕获异常示例 3 | * 4 | * @author peng.zhang 5 | */ 6 | import java.io.{FileNotFoundException, FileReader, IOException} 7 | 8 | object Test { 9 | def main(args: Array[String]) { 10 | try { 11 | val f = new FileReader("input.txt") 12 | } catch { 13 | case ex: FileNotFoundException => { 14 | println("Missing file exception") 15 | } 16 | case ex: IOException => { 17 | println("IO Exception") 18 | } 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /codes/scala/src/main/scala/ExceptionDemo2.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * 捕获异常示例,含 finally 3 | * 4 | * @author peng.zhang 5 | */ 6 | import java.io.{FileNotFoundException, FileReader, IOException} 7 | 8 | object Test { 9 | def main(args: Array[String]) { 10 | try { 11 | val f = new FileReader("input.txt") 12 | } catch { 13 | case ex: FileNotFoundException => { 14 | println("Missing file exception") 15 | } 16 | case ex: IOException => { 17 | println("IO Exception") 18 | } 19 | } finally { 20 | println("Exiting finally...") 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /codes/scala/src/main/scala/HelloWorld.scala: -------------------------------------------------------------------------------- 1 | object HelloWorld { 2 | /* 3 | * 这是我的第一个 Scala 程序 4 | * 以下程序将输出'Hello World!' 
5 | */ 6 | def main(args: Array[String]) { 7 | println("Hello, world!") // 输出 Hello World 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /codes/scala/src/main/scala/IfDemo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * 条件控制语句示例 3 | * 4 | * @author peng.zhang 5 | */ 6 | object IfDemo { 7 | def main(args: Array[String]) { 8 | var x = 30; 9 | 10 | if (x == 10) { 11 | println("X 的值为 10"); 12 | } else if (x == 20) { 13 | println("X 的值为 20"); 14 | } else if (x == 30) { 15 | println("X 的值为 30"); 16 | } else { 17 | println("无法判断 X 的值"); 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /codes/scala/src/main/scala/MatchDemo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * @author peng.zhang 3 | */ 4 | object MatchDemo { 5 | def main(args: Array[String]) { 6 | println(matchTest("two")) 7 | println(matchTest("test")) 8 | println(matchTest(1)) 9 | println(matchTest(6)) 10 | 11 | } 12 | 13 | def matchTest(x: Any): Any = x match { 14 | case 1 => "one" 15 | case "two" => 2 16 | case y: Int => "scala.Int" 17 | case _ => "many" 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /codes/scala/src/main/scala/SourceDemo.scala: -------------------------------------------------------------------------------- 1 | import scala.io.Source 2 | 3 | /** 4 | * Source 示例 5 | * 6 | * @author peng.zhang 7 | */ 8 | object SourceDemo { 9 | def main(args: Array[String]) { 10 | println("文件内容为:") 11 | 12 | Source.fromFile("test.txt").foreach { 13 | print 14 | } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /codes/scala/src/main/scala/StdInDemo.scala: -------------------------------------------------------------------------------- 1 | import scala.io._ 2 | 3 | /** 4 | * StdIn 示例 5 | * 6 | * @author peng.zhang 7 | */ 8 | object StdInDemo { 9 | def main(args: Array[String]) { 10 | print("请输入内容: ") 11 | val line = StdIn.readLine() 12 | 13 | println("你输入的是: " + line) 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /codes/scala/src/main/scala/TraitDemo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * trait 示例 3 | * 4 | * @author peng.zhang 5 | */ 6 | trait Equal { 7 | def isEqual(x: Any): Boolean 8 | 9 | def isNotEqual(x: Any): Boolean = !isEqual(x) 10 | } 11 | 12 | class Point(xc: Int, yc: Int) extends Equal { 13 | var x: Int = xc 14 | var y: Int = yc 15 | 16 | def isEqual(obj: Any) = 17 | obj.isInstanceOf[Point] && 18 | obj.asInstanceOf[Point].x == x 19 | } 20 | 21 | object TraitDemo { 22 | def main(args: Array[String]) { 23 | val p1 = new Point(2, 3) 24 | val p2 = new Point(2, 4) 25 | val p3 = new Point(3, 3) 26 | 27 | println(p1.isNotEqual(p2)) 28 | println(p1.isNotEqual(p3)) 29 | println(p1.isNotEqual(2)) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /codes/scala/src/main/scala/WhileDemo.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * 循环控制语句示例 3 | * 4 | * @author peng.zhang 5 | */ 6 | object WhileDemo { 7 | def main(args: Array[String]) { 8 | // 局部变量 9 | var a = 10; 10 | 11 | // while 循环执行 12 | while (a < 20) { 13 | println("Value of a: " + a); 14 | a = a + 1; 15 | } 16 | } 17 | } 18 | 
-------------------------------------------------------------------------------- /codes/scala/src/main/scala/test.txt: -------------------------------------------------------------------------------- 1 | To do or not to do, that's a question. 2 | -------------------------------------------------------------------------------- /codes/zookeeper/README.md: -------------------------------------------------------------------------------- 1 | # ZooKeeper Demo 2 | 3 | - ZooKeeper API 基本示例 4 | - ZooKeeper 典型应用示例 5 | - 分布式 ID 6 | - 分布式锁 7 | - 分布式配置管理 -------------------------------------------------------------------------------- /codes/zookeeper/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | 7 | org.springframework.boot 8 | spring-boot-starter-parent 9 | 2.6.3 10 | 11 | 12 | io.github.dunwu.bigdata 13 | zookeeper 14 | 大数据 - ZooKeeper 15 | 1.0.0 16 | jar 17 | 18 | 19 | UTF-8 20 | UTF-8 21 | 1.8 22 | ${java.version} 23 | ${java.version} 24 | 25 | 26 | 27 | 28 | org.apache.zookeeper 29 | zookeeper 30 | 3.7.0 31 | 32 | 33 | org.apache.curator 34 | curator-recipes 35 | 5.1.0 36 | 37 | 38 | cn.hutool 39 | hutool-all 40 | 5.7.20 41 | 42 | 43 | org.springframework.boot 44 | spring-boot-starter-test 45 | test 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /codes/zookeeper/src/main/java/io/github/dunwu/bigdata/zk/config/ActiveKeyValueStore.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.zk.config; 2 | 3 | import org.apache.zookeeper.CreateMode; 4 | import org.apache.zookeeper.KeeperException; 5 | import org.apache.zookeeper.Watcher; 6 | import org.apache.zookeeper.ZooDefs.Ids; 7 | import org.apache.zookeeper.data.Stat; 8 | 9 | import java.nio.charset.Charset; 10 | 11 | public class ActiveKeyValueStore extends ConnectionWatcher { 12 | 13 | private static final Charset CHARSET = Charset.forName("UTF-8"); 14 | 15 | public void write(String path, String value) throws InterruptedException, 16 | KeeperException { 17 | Stat stat = zk.exists(path, false); 18 | if (stat == null) { 19 | zk.create(path, value.getBytes(CHARSET), Ids.OPEN_ACL_UNSAFE, 20 | CreateMode.PERSISTENT); 21 | } else { 22 | zk.setData(path, value.getBytes(CHARSET), -1); 23 | } 24 | } 25 | 26 | public String read(String path, Watcher watcher) throws InterruptedException, 27 | KeeperException { 28 | byte[] data = zk.getData(path, watcher, null/*stat*/); 29 | return new String(data, CHARSET); 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /codes/zookeeper/src/main/java/io/github/dunwu/bigdata/zk/config/ConfigUpdater.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.zk.config; 2 | 3 | import org.apache.zookeeper.KeeperException; 4 | 5 | import java.io.IOException; 6 | import java.util.Random; 7 | import java.util.concurrent.TimeUnit; 8 | 9 | public class ConfigUpdater { 10 | 11 | public static final String PATH = "/config"; 12 | 13 | private ActiveKeyValueStore store; 14 | private Random random = new Random(); 15 | 16 | public ConfigUpdater(String hosts) throws IOException, InterruptedException { 17 | store = new ActiveKeyValueStore(); 18 | store.connect(hosts); 19 | } 20 | 21 | public void run() throws InterruptedException, KeeperException { 22 | while (true) { 23 | String value = random.nextInt(100) + ""; 24 | 
store.write(PATH, value); 25 | System.out.printf("Set %s to %s\n", PATH, value); 26 | TimeUnit.SECONDS.sleep(random.nextInt(10)); 27 | } 28 | } 29 | 30 | public static void main(String[] args) throws Exception { 31 | ConfigUpdater configUpdater = new ConfigUpdater("localhost"); 32 | configUpdater.run(); 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /codes/zookeeper/src/main/java/io/github/dunwu/bigdata/zk/config/ConfigWatcher.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.zk.config; 2 | 3 | import org.apache.zookeeper.KeeperException; 4 | import org.apache.zookeeper.WatchedEvent; 5 | import org.apache.zookeeper.Watcher; 6 | import org.apache.zookeeper.Watcher.Event.EventType; 7 | 8 | import java.io.IOException; 9 | 10 | public class ConfigWatcher implements Watcher { 11 | 12 | private ActiveKeyValueStore store; 13 | 14 | public ConfigWatcher(String hosts) throws IOException, InterruptedException { 15 | store = new ActiveKeyValueStore(); 16 | store.connect(hosts); 17 | } 18 | 19 | public void displayConfig() throws InterruptedException, KeeperException { 20 | String value = store.read(ConfigUpdater.PATH, this); 21 | System.out.printf("Read %s as %s\n", ConfigUpdater.PATH, value); 22 | } 23 | 24 | @Override 25 | public void process(WatchedEvent event) { 26 | if (event.getType() == EventType.NodeDataChanged) { 27 | try { 28 | displayConfig(); 29 | } catch (InterruptedException e) { 30 | System.err.println("Interrupted. Exiting."); 31 | Thread.currentThread().interrupt(); 32 | } catch (KeeperException e) { 33 | System.err.printf("KeeperException: %s. Exiting.\n", e); 34 | } 35 | } 36 | } 37 | 38 | public static void main(String[] args) throws Exception { 39 | ConfigWatcher configWatcher = new ConfigWatcher("localhost"); 40 | configWatcher.displayConfig(); 41 | 42 | // stay alive until process is killed or thread is interrupted 43 | Thread.sleep(Long.MAX_VALUE); 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /codes/zookeeper/src/main/java/io/github/dunwu/bigdata/zk/config/ConnectionWatcher.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.zk.config; 2 | 3 | import org.apache.zookeeper.WatchedEvent; 4 | import org.apache.zookeeper.Watcher; 5 | import org.apache.zookeeper.Watcher.Event.KeeperState; 6 | import org.apache.zookeeper.ZooKeeper; 7 | 8 | import java.io.IOException; 9 | import java.util.concurrent.CountDownLatch; 10 | 11 | public class ConnectionWatcher implements Watcher { 12 | 13 | private static final int SESSION_TIMEOUT = 5000; 14 | 15 | protected ZooKeeper zk; 16 | private CountDownLatch connectedSignal = new CountDownLatch(1); 17 | 18 | public void connect(String hosts) throws IOException, InterruptedException { 19 | zk = new ZooKeeper(hosts, SESSION_TIMEOUT, this); 20 | connectedSignal.await(); 21 | } 22 | 23 | @Override 24 | public void process(WatchedEvent event) { 25 | if (event.getState() == KeeperState.SyncConnected) { 26 | connectedSignal.countDown(); 27 | } 28 | } 29 | 30 | public void close() throws InterruptedException { 31 | zk.close(); 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /codes/zookeeper/src/main/java/io/github/dunwu/bigdata/zk/config/ResilientActiveKeyValueStore.java: 
-------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.zk.config; 2 | 3 | import org.apache.zookeeper.CreateMode; 4 | import org.apache.zookeeper.KeeperException; 5 | import org.apache.zookeeper.Watcher; 6 | import org.apache.zookeeper.ZooDefs.Ids; 7 | import org.apache.zookeeper.data.Stat; 8 | 9 | import java.nio.charset.Charset; 10 | import java.util.concurrent.TimeUnit; 11 | 12 | public class ResilientActiveKeyValueStore extends ConnectionWatcher { 13 | 14 | private static final Charset CHARSET = Charset.forName("UTF-8"); 15 | private static final int MAX_RETRIES = 5; 16 | private static final int RETRY_PERIOD_SECONDS = 10; 17 | 18 | public void write(String path, String value) throws InterruptedException, 19 | KeeperException { 20 | int retries = 0; 21 | while (true) { 22 | try { 23 | Stat stat = zk.exists(path, false); 24 | if (stat == null) { 25 | zk.create(path, value.getBytes(CHARSET), Ids.OPEN_ACL_UNSAFE, 26 | CreateMode.PERSISTENT); 27 | } else { 28 | zk.setData(path, value.getBytes(CHARSET), stat.getVersion()); 29 | } 30 | return; 31 | } catch (KeeperException.SessionExpiredException e) { 32 | throw e; 33 | } catch (KeeperException e) { 34 | if (retries++ == MAX_RETRIES) { 35 | throw e; 36 | } 37 | // sleep then retry 38 | TimeUnit.SECONDS.sleep(RETRY_PERIOD_SECONDS); 39 | } 40 | } 41 | } 42 | 43 | public String read(String path, Watcher watcher) throws InterruptedException, 44 | KeeperException { 45 | byte[] data = zk.getData(path, watcher, null/*stat*/); 46 | return new String(data, CHARSET); 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /codes/zookeeper/src/main/java/io/github/dunwu/bigdata/zk/config/ResilientConfigUpdater.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.zk.config; 2 | 3 | import java.io.IOException; 4 | import java.util.Random; 5 | import java.util.concurrent.TimeUnit; 6 | 7 | import org.apache.zookeeper.KeeperException; 8 | 9 | public class ResilientConfigUpdater { 10 | 11 | public static final String PATH = "/config"; 12 | 13 | private ResilientActiveKeyValueStore store; 14 | private Random random = new Random(); 15 | 16 | public ResilientConfigUpdater(String hosts) throws IOException, 17 | InterruptedException { 18 | store = new ResilientActiveKeyValueStore(); 19 | store.connect(hosts); 20 | } 21 | 22 | public void run() throws InterruptedException, KeeperException { 23 | while (true) { 24 | String value = random.nextInt(100) + ""; 25 | store.write(PATH, value); 26 | System.out.printf("Set %s to %s\n", PATH, value); 27 | TimeUnit.SECONDS.sleep(random.nextInt(10)); 28 | } 29 | } 30 | 31 | //vv ResilientConfigUpdater 32 | public static void main(String[] args) throws Exception { 33 | /*[*/while (true) { 34 | try {/*]*/ 35 | ResilientConfigUpdater configUpdater = 36 | new ResilientConfigUpdater(args[0]); 37 | configUpdater.run(); 38 | /*[*/} catch (KeeperException.SessionExpiredException e) { 39 | // start a new session 40 | } catch (KeeperException e) { 41 | // already retried, so exit 42 | e.printStackTrace(); 43 | break; 44 | } 45 | }/*]*/ 46 | } 47 | //^^ ResilientConfigUpdater 48 | } 49 | -------------------------------------------------------------------------------- /codes/zookeeper/src/main/java/io/github/dunwu/bigdata/zk/config/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 利用 ZooKeeper 
作为分布式配置中心 3 | * 4 | * @author Zhang Peng 5 | * @since 2020-06-02 6 | */ 7 | package io.github.dunwu.bigdata.zk.config; 8 | -------------------------------------------------------------------------------- /codes/zookeeper/src/main/java/io/github/dunwu/bigdata/zk/dlock/Callback.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.zk.dlock; 2 | 3 | /** 4 | * Created by sunyujia@aliyun.com on 2016/2/23. 5 | */ 6 | public interface Callback { 7 | 8 | V onGetLock() throws InterruptedException; 9 | 10 | V onTimeout() throws InterruptedException; 11 | 12 | } 13 | -------------------------------------------------------------------------------- /codes/zookeeper/src/main/java/io/github/dunwu/bigdata/zk/dlock/DLockTemplate.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.zk.dlock; 2 | 3 | /** 4 | * 分布式锁模板类 Created by sunyujia@aliyun.com on 2016/2/23. 5 | */ 6 | public interface DLockTemplate { 7 | 8 | /** 9 | * @param lockId 锁id(对应业务唯一ID) 10 | * @param timeout 单位毫秒 11 | * @param callback 回调函数 12 | * @return 13 | */ 14 | V execute(String lockId, long timeout, Callback callback); 15 | 16 | } 17 | -------------------------------------------------------------------------------- /codes/zookeeper/src/main/java/io/github/dunwu/bigdata/zk/dlock/DistributedLock.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.zk.dlock; 2 | 3 | import java.util.concurrent.TimeUnit; 4 | 5 | /** 6 | * 分布式锁接口 7 | * 8 | * @author Zhang Peng 9 | * @since 2020-01-14 10 | */ 11 | public interface DistributedLock { 12 | 13 | void lock(); 14 | 15 | boolean tryLock(long timeout, TimeUnit unit); 16 | 17 | void unlock(); 18 | 19 | } 20 | -------------------------------------------------------------------------------- /codes/zookeeper/src/main/java/io/github/dunwu/bigdata/zk/dlock/TimeoutHandler.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.zk.dlock; 2 | 3 | /** 4 | * @author Zhang Peng 5 | * @since 2020-01-14 6 | */ 7 | public interface TimeoutHandler { 8 | 9 | V onTimeout() throws InterruptedException; 10 | 11 | } 12 | -------------------------------------------------------------------------------- /codes/zookeeper/src/main/java/io/github/dunwu/bigdata/zk/dlock/ZkDLockTemplate.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.zk.dlock; 2 | 3 | import org.apache.curator.framework.CuratorFramework; 4 | import org.slf4j.Logger; 5 | import org.slf4j.LoggerFactory; 6 | 7 | import java.util.concurrent.TimeUnit; 8 | 9 | /** 10 | * Created by sunyujia@aliyun.com on 2016/2/26. 
11 | */ 12 | public class ZkDLockTemplate implements DLockTemplate { 13 | 14 | private static final Logger log = LoggerFactory.getLogger(ZkDLockTemplate.class); 15 | 16 | private final CuratorFramework client; 17 | 18 | public ZkDLockTemplate(CuratorFramework client) { 19 | this.client = client; 20 | } 21 | 22 | @Override 23 | public V execute(String lockId, long timeout, Callback callback) { 24 | ZookeeperReentrantDistributedLock lock = null; 25 | boolean getLock = false; 26 | try { 27 | lock = new ZookeeperReentrantDistributedLock(client, lockId); 28 | if (tryLock(lock, timeout)) { 29 | getLock = true; 30 | return callback.onGetLock(); 31 | } else { 32 | return callback.onTimeout(); 33 | } 34 | } catch (InterruptedException ex) { 35 | log.error(ex.getMessage(), ex); 36 | Thread.currentThread().interrupt(); 37 | } catch (Exception e) { 38 | log.error(e.getMessage(), e); 39 | } finally { 40 | if (getLock) { 41 | lock.unlock(); 42 | } 43 | } 44 | return null; 45 | } 46 | 47 | private boolean tryLock(ZookeeperReentrantDistributedLock lock, long timeout) { 48 | return lock.tryLock(timeout, TimeUnit.MILLISECONDS); 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /codes/zookeeper/src/main/java/io/github/dunwu/bigdata/zk/dlock/ZkReentrantLockCleanerTask.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.zk.dlock; 2 | 3 | import org.apache.curator.RetryPolicy; 4 | import org.apache.curator.framework.CuratorFramework; 5 | import org.apache.curator.framework.CuratorFrameworkFactory; 6 | import org.apache.curator.retry.ExponentialBackoffRetry; 7 | import org.slf4j.LoggerFactory; 8 | 9 | import java.util.List; 10 | import java.util.TimerTask; 11 | import java.util.concurrent.Executors; 12 | import java.util.concurrent.ScheduledExecutorService; 13 | 14 | /** 15 | * Created by sunyujia@aliyun.com on 2016/2/25. 
16 | */ 17 | public class ZkReentrantLockCleanerTask extends TimerTask { 18 | 19 | private static final org.slf4j.Logger log = LoggerFactory.getLogger(ZkReentrantLockCleanerTask.class); 20 | 21 | private CuratorFramework client; 22 | 23 | private ScheduledExecutorService executorService = Executors.newScheduledThreadPool(3); 24 | 25 | /** 26 | * 检查周期 27 | */ 28 | private long period = 5000; 29 | 30 | /** 31 | * Curator RetryPolicy maxRetries 32 | */ 33 | private int maxRetries = 3; 34 | 35 | /** 36 | * Curator RetryPolicy baseSleepTimeMs 37 | */ 38 | private final int baseSleepTimeMs = 1000; 39 | 40 | public ZkReentrantLockCleanerTask(String zookeeperAddress) { 41 | try { 42 | RetryPolicy retryPolicy = new ExponentialBackoffRetry(baseSleepTimeMs, maxRetries); 43 | client = CuratorFrameworkFactory.newClient(zookeeperAddress, retryPolicy); 44 | client.start(); 45 | } catch (Exception e) { 46 | log.error(e.getMessage(), e); 47 | } catch (Throwable ex) { 48 | ex.printStackTrace(); 49 | log.error(ex.getMessage(), ex); 50 | } 51 | } 52 | 53 | public void start() { 54 | executorService.execute(this); 55 | } 56 | 57 | private boolean isEmpty(List list) { 58 | return list == null || list.isEmpty(); 59 | } 60 | 61 | @Override 62 | public void run() { 63 | try { 64 | List childrenPaths = this.client.getChildren().forPath(ZookeeperReentrantDistributedLock.ROOT_PATH); 65 | for (String path : childrenPaths) { 66 | cleanNode(path); 67 | } 68 | } catch (Exception e) { 69 | e.printStackTrace(); 70 | } 71 | } 72 | 73 | private void cleanNode(String path) { 74 | try { 75 | if (isEmpty(this.client.getChildren().forPath(path))) { 76 | this.client.delete().forPath(path);//利用存在子节点无法删除和zk的原子性这两个特性. 77 | } 78 | } catch (Exception e) { 79 | e.printStackTrace(); 80 | } 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /codes/zookeeper/src/main/java/io/github/dunwu/bigdata/zk/dlock/ZookeeperReentrantDistributedLock.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.zk.dlock; 2 | 3 | import cn.hutool.core.collection.CollectionUtil; 4 | import org.apache.curator.framework.CuratorFramework; 5 | import org.apache.curator.framework.recipes.locks.InterProcessMutex; 6 | import org.slf4j.Logger; 7 | import org.slf4j.LoggerFactory; 8 | 9 | import java.util.List; 10 | import java.util.concurrent.Executors; 11 | import java.util.concurrent.ScheduledExecutorService; 12 | import java.util.concurrent.TimeUnit; 13 | 14 | /** 15 | * 基于Zookeeper的可重入互斥锁(关于重入:仅限于持有zk锁的jvm内重入) Created by sunyujia@aliyun.com on 2016/2/24. 16 | */ 17 | public class ZookeeperReentrantDistributedLock implements DistributedLock { 18 | 19 | private static final Logger log = LoggerFactory.getLogger(ZookeeperReentrantDistributedLock.class); 20 | 21 | /** 22 | * 线程池 23 | */ 24 | private static final ScheduledExecutorService executorService = Executors.newScheduledThreadPool(3); 25 | 26 | /** 27 | * 所有PERSISTENT锁节点的根位置 28 | */ 29 | public static final String ROOT_PATH = "/distributed_lock/"; 30 | 31 | /** 32 | * 每次延迟清理PERSISTENT节点的时间 Unit:MILLISECONDS 33 | */ 34 | private static final long DELAY_TIME_FOR_CLEAN = 1000; 35 | 36 | /** 37 | * zk 共享锁实现 38 | */ 39 | private InterProcessMutex interProcessMutex; 40 | 41 | /** 42 | * 锁的ID,对应zk一个PERSISTENT节点,下挂EPHEMERAL节点. 
43 | */ 44 | private String path; 45 | 46 | /** 47 | * zk的客户端 48 | */ 49 | private CuratorFramework client; 50 | 51 | public ZookeeperReentrantDistributedLock(CuratorFramework client, String lockId) { 52 | this.client = client; 53 | this.path = ROOT_PATH + lockId; 54 | interProcessMutex = new InterProcessMutex(this.client, this.path); 55 | } 56 | 57 | @Override 58 | public void lock() { 59 | try { 60 | interProcessMutex.acquire(); 61 | } catch (Exception e) { 62 | log.error(e.getMessage(), e); 63 | } 64 | } 65 | 66 | @Override 67 | public boolean tryLock(long timeout, TimeUnit unit) { 68 | try { 69 | return interProcessMutex.acquire(timeout, unit); 70 | } catch (Exception e) { 71 | log.error(e.getMessage(), e); 72 | return false; 73 | } 74 | } 75 | 76 | @Override 77 | public void unlock() { 78 | try { 79 | interProcessMutex.release(); 80 | } catch (Throwable e) { 81 | log.error(e.getMessage(), e); 82 | } finally { 83 | executorService.schedule(new Cleaner(client, path), DELAY_TIME_FOR_CLEAN, TimeUnit.MILLISECONDS); 84 | } 85 | } 86 | 87 | static class Cleaner implements Runnable { 88 | 89 | private String path; 90 | 91 | private CuratorFramework client; 92 | 93 | public Cleaner(CuratorFramework client, String path) { 94 | this.path = path; 95 | this.client = client; 96 | } 97 | 98 | @Override 99 | public void run() { 100 | try { 101 | List list = client.getChildren().forPath(path); 102 | if (CollectionUtil.isEmpty(list)) { 103 | client.delete().forPath(path); 104 | } 105 | } catch (Exception e) { 106 | log.error(e.getMessage(), e); 107 | } 108 | } 109 | 110 | } 111 | 112 | } 113 | -------------------------------------------------------------------------------- /codes/zookeeper/src/main/java/io/github/dunwu/bigdata/zk/dlock/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 利用 ZooKeeper 实现分布式锁 3 | * 4 | * @author Zhang Peng 5 | * @since 2020-06-02 6 | */ 7 | package io.github.dunwu.bigdata.zk.dlock; 8 | -------------------------------------------------------------------------------- /codes/zookeeper/src/main/java/io/github/dunwu/bigdata/zk/id/DistributedId.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.zk.id; 2 | 3 | /** 4 | * 分布式 ID 接口 5 | */ 6 | public interface DistributedId { 7 | 8 | Long generate(); 9 | 10 | } 11 | -------------------------------------------------------------------------------- /codes/zookeeper/src/main/java/io/github/dunwu/bigdata/zk/id/ZkDistributedId.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.zk.id; 2 | 3 | import org.apache.curator.RetryPolicy; 4 | import org.apache.curator.framework.CuratorFramework; 5 | import org.apache.curator.framework.CuratorFrameworkFactory; 6 | import org.apache.curator.retry.ExponentialBackoffRetry; 7 | import org.slf4j.Logger; 8 | import org.slf4j.LoggerFactory; 9 | 10 | /** 11 | * ZooKeeper 实现的分布式ID生成器 12 | * 13 | * @author Zhang Peng 14 | * @since 2020-06-03 15 | */ 16 | public class ZkDistributedId implements DistributedId { 17 | 18 | private static final Logger log = LoggerFactory.getLogger(ZkDistributedId.class); 19 | 20 | private CuratorFramework client; 21 | 22 | /** 23 | * 最大尝试次数 24 | */ 25 | private final int MAX_RETRIES = 3; 26 | 27 | /** 28 | * 等待时间,单位:毫秒 29 | */ 30 | private final int BASE_SLEEP_TIME = 1000; 31 | 32 | /** 33 | * 默认 ID 存储目录 34 | */ 35 | public static final String DEFAULT_ID_PATH = "/dunwu:id"; 36 
| 37 | public ZkDistributedId(String connectionString) { 38 | this(connectionString, DEFAULT_ID_PATH); 39 | } 40 | 41 | public ZkDistributedId(String connectionString, String path) { 42 | try { 43 | RetryPolicy retryPolicy = new ExponentialBackoffRetry(BASE_SLEEP_TIME, MAX_RETRIES); 44 | client = CuratorFrameworkFactory.newClient(connectionString, retryPolicy); 45 | client.start(); 46 | // 自动创建 ID 存储目录 47 | client.create().forPath(path); 48 | } catch (Exception e) { 49 | log.error(e.getMessage(), e); 50 | } 51 | } 52 | 53 | @Override 54 | public Long generate() { 55 | try { 56 | int value = client.setData().withVersion(-1).forPath(DEFAULT_ID_PATH, "".getBytes()).getVersion(); 57 | return (long) value; 58 | } catch (Exception e) { 59 | e.printStackTrace(); 60 | } 61 | return null; 62 | } 63 | 64 | } 65 | -------------------------------------------------------------------------------- /codes/zookeeper/src/test/java/io/github/dunwu/bigdata/zk/dlock/ZkReentrantLockTemplateTest.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.zk.dlock; 2 | 3 | import org.apache.curator.RetryPolicy; 4 | import org.apache.curator.framework.CuratorFramework; 5 | import org.apache.curator.framework.CuratorFrameworkFactory; 6 | import org.apache.curator.retry.ExponentialBackoffRetry; 7 | import org.junit.jupiter.api.Test; 8 | 9 | import java.util.concurrent.CountDownLatch; 10 | import java.util.concurrent.ThreadLocalRandom; 11 | 12 | /** 13 | * Created by sunyujia@aliyun.com on 2016/2/24. 14 | */ 15 | 16 | public class ZkReentrantLockTemplateTest { 17 | 18 | @Test 19 | public void testTry() throws InterruptedException { 20 | RetryPolicy retryPolicy = new ExponentialBackoffRetry(1000, 3); 21 | CuratorFramework client = CuratorFrameworkFactory.newClient("localhost:2181", retryPolicy); 22 | client.start(); 23 | 24 | final ZkDLockTemplate template = new ZkDLockTemplate(client); 25 | int size = 100; 26 | final CountDownLatch startCountDownLatch = new CountDownLatch(1); 27 | final CountDownLatch endDownLatch = new CountDownLatch(size); 28 | for (int i = 0; i < size; i++) { 29 | new Thread(() -> { 30 | try { 31 | startCountDownLatch.await(); 32 | } catch (InterruptedException e) { 33 | Thread.currentThread().interrupt(); 34 | } 35 | final int sleepTime = ThreadLocalRandom.current().nextInt(3) * 1000; 36 | 37 | template.execute("test", 3000, new Callback() { 38 | @Override 39 | public Object onGetLock() throws InterruptedException { 40 | System.out.println(Thread.currentThread().getName() + " 获取锁"); 41 | Thread.currentThread().sleep(sleepTime); 42 | System.out.println(Thread.currentThread().getName() + ":sleeped:" + sleepTime); 43 | endDownLatch.countDown(); 44 | return null; 45 | } 46 | 47 | @Override 48 | public Object onTimeout() throws InterruptedException { 49 | System.out.println(Thread.currentThread().getName() + " 获取锁"); 50 | Thread.currentThread().sleep(sleepTime); 51 | System.out.println(Thread.currentThread().getName() + ":sleeped:" + sleepTime); 52 | endDownLatch.countDown(); 53 | return null; 54 | } 55 | }); 56 | }).start(); 57 | } 58 | startCountDownLatch.countDown(); 59 | endDownLatch.await(); 60 | } 61 | 62 | public static void main(String[] args) { 63 | RetryPolicy retryPolicy = new ExponentialBackoffRetry(1000, 3); 64 | CuratorFramework client = CuratorFrameworkFactory.newClient("127.0.0.1:2181", retryPolicy); 65 | client.start(); 66 | 67 | final ZkDLockTemplate template = new ZkDLockTemplate(client);//本类多线程安全,可通过spring注入 68 | 
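// 以“订单流水号”作为锁标识获取分布式锁,最长等待 5000 毫秒:
// 在超时时间内拿到锁则回调 onGetLock,否则回调 onTimeout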
template.execute("订单流水号", 5000, new Callback() { 69 | @Override 70 | public Object onGetLock() throws InterruptedException { 71 | //TODO 获得锁后要做的事 72 | return null; 73 | } 74 | 75 | @Override 76 | public Object onTimeout() throws InterruptedException { 77 | //TODO 获得锁超时后要做的事 78 | return null; 79 | } 80 | }); 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /codes/zookeeper/src/test/java/io/github/dunwu/bigdata/zk/id/ZkDistributedIdTest.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.zk.id; 2 | 3 | import java.util.Set; 4 | import java.util.concurrent.*; 5 | 6 | /** 7 | * 并发测试生成分布式ID 8 | * 9 | * @author Zhang Peng 10 | * @since 2020-06-03 11 | */ 12 | public class ZkDistributedIdTest { 13 | 14 | public static void main(String[] args) throws ExecutionException, InterruptedException { 15 | DistributedId distributedId = new ZkDistributedId("localhost:2181"); 16 | 17 | final CountDownLatch latch = new CountDownLatch(10000); 18 | final ExecutorService executorService = Executors.newFixedThreadPool(20); 19 | long begin = System.nanoTime(); 20 | 21 | Set set = new ConcurrentSkipListSet<>(); 22 | for (int i = 0; i < 10000; i++) { 23 | Future future = executorService.submit(new MyThread(latch, distributedId)); 24 | set.add(future.get()); 25 | } 26 | 27 | try { 28 | latch.await(); 29 | executorService.shutdown(); 30 | 31 | long end = System.nanoTime(); 32 | long time = end - begin; 33 | System.out.println("ID 数:" + set.size()); 34 | System.out.println("耗时:" + TimeUnit.NANOSECONDS.toSeconds(time) + " 秒"); 35 | } catch (Exception e) { 36 | e.printStackTrace(); 37 | } 38 | } 39 | 40 | static class MyThread implements Callable { 41 | 42 | private final CountDownLatch latch; 43 | private final DistributedId distributedId; 44 | 45 | MyThread(CountDownLatch latch, DistributedId distributedId) { 46 | this.latch = latch; 47 | this.distributedId = distributedId; 48 | } 49 | 50 | @Override 51 | public Long call() { 52 | Long id = distributedId.generate(); 53 | latch.countDown(); 54 | return id; 55 | } 56 | 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /codes/zookeeper/src/test/java/io/github/dunwu/bigdata/zk/zookeeper/ZooKeeperTest.java: -------------------------------------------------------------------------------- 1 | package io.github.dunwu.bigdata.zk.zookeeper; 2 | 3 | import cn.hutool.core.collection.CollectionUtil; 4 | import org.apache.zookeeper.*; 5 | import org.apache.zookeeper.data.Stat; 6 | import org.junit.jupiter.api.*; 7 | 8 | import java.io.IOException; 9 | import java.util.List; 10 | import java.util.concurrent.CountDownLatch; 11 | 12 | /** 13 | * ZooKeeper 官方客户端测试例 14 | * 15 | * @author Zhang Peng 16 | * @since 2022-02-19 17 | */ 18 | @DisplayName("ZooKeeper 官方客户端测试例") 19 | public class ZooKeeperTest { 20 | 21 | /** 22 | * ZooKeeper 连接实例 23 | */ 24 | private static ZooKeeper zk = null; 25 | 26 | /** 27 | * 创建 ZooKeeper 连接 28 | */ 29 | @BeforeAll 30 | public static void init() throws IOException, InterruptedException { 31 | final String HOST = "localhost:2181"; 32 | CountDownLatch latch = new CountDownLatch(1); 33 | zk = new ZooKeeper(HOST, 5000, watcher -> { 34 | if (watcher.getState() == Watcher.Event.KeeperState.SyncConnected) { 35 | latch.countDown(); 36 | } 37 | }); 38 | latch.await(); 39 | } 40 | 41 | /** 42 | * 关闭 ZooKeeper 连接 43 | */ 44 | @AfterAll 45 | public static void destroy() throws 
InterruptedException { 46 | if (zk != null) { 47 | zk.close(); 48 | } 49 | } 50 | 51 | @Test 52 | @DisplayName("建立连接测试") 53 | public void getState() { 54 | ZooKeeper.States state = zk.getState(); 55 | Assertions.assertTrue(state.isAlive()); 56 | } 57 | 58 | private static final String path = "/mytest"; 59 | 60 | @Test 61 | @DisplayName("创建、删除节点测试") 62 | public void createAndDeleteNode() throws InterruptedException, KeeperException { 63 | 64 | // 创建节点 65 | String text = "My first zookeeper app"; 66 | zk.create(path, text.getBytes(), ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); 67 | 68 | // 判断节点是否存在 69 | Stat stat = zk.exists(path, true); 70 | Assertions.assertNotNull(stat); 71 | 72 | // 删除节点 73 | zk.delete(path, zk.exists(path, true).getVersion()); 74 | 75 | // 再次判断节点是否存在 76 | stat = zk.exists(path, true); 77 | Assertions.assertNull(stat); 78 | } 79 | 80 | @Test 81 | @DisplayName("设置、获取节点数据测试") 82 | public void setAndGetNodeData() throws InterruptedException, KeeperException { 83 | 84 | // 创建节点 85 | String text = "My first zookeeper app"; 86 | zk.create(path, text.getBytes(), ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); 87 | 88 | // 判断节点是否存在 89 | Stat stat = zk.exists(path, true); 90 | Assertions.assertNotNull(stat); 91 | 92 | // 获取节点数据 93 | byte[] data = zk.getData(path, false, null); 94 | Assertions.assertEquals(text, new String(data)); 95 | System.out.println("修改前的节点数据:" + new String(data)); 96 | 97 | // 设置节点数据 98 | String text2 = "Content is changed."; 99 | zk.setData(path, text2.getBytes(), zk.exists(path, true).getVersion()); 100 | 101 | // 再次获取节点数据 102 | byte[] data2 = zk.getData(path, false, null); 103 | Assertions.assertEquals(text2, new String(data2)); 104 | System.out.println("修改后的节点数据:" + new String(data2)); 105 | 106 | // 删除节点 107 | zk.delete(path, zk.exists(path, true).getVersion()); 108 | 109 | // 再次判断节点是否存在 110 | stat = zk.exists(path, true); 111 | Assertions.assertNull(stat); 112 | } 113 | 114 | @Test 115 | @DisplayName("获取节点的子节点测试") 116 | public void getChildren() throws InterruptedException, KeeperException { 117 | 118 | String text = "含子节点的节点"; 119 | zk.create(path, text.getBytes(), ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); 120 | zk.create(path + "/1", "1".getBytes(), ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); 121 | zk.create(path + "/2", "1".getBytes(), ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); 122 | List children = zk.getChildren(path, false); 123 | for (String child : children) { 124 | System.out.println(child); 125 | } 126 | List expectedList = CollectionUtil.newArrayList("1", "2"); 127 | Assertions.assertTrue(CollectionUtil.containsAll(expectedList, children)); 128 | 129 | // 删除节点 130 | zk.delete(path + "/1", zk.exists(path + "/1", true).getVersion()); 131 | zk.delete(path + "/2", zk.exists(path + "/2", true).getVersion()); 132 | zk.delete(path, zk.exists(path, true).getVersion()); 133 | } 134 | 135 | } 136 | -------------------------------------------------------------------------------- /codes/zookeeper/src/test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | ${LOG_MSG} 9 | 10 | 11 | 12 | 13 | DEBUG 14 | 15 | ${USER_HOME}/debug.log 16 | 17 | ${LOG_DIR}/debug%i.log 18 | 19 | 20MB 20 | 21 | 22 | 23 | ${LOG_MSG} 24 | 25 | 26 | 27 | ${USER_HOME}/info.log 28 | 29 | INFO 30 | 31 | 32 | ${LOG_DIR}/info%i.log 33 | 34 | 20MB 35 | 36 | 37 | 38 | ${LOG_MSG} 39 | 40 | 41 | 42 | ${USER_HOME}/error.log 43 | 44 | ERROR 45 | 46 | 47 | 
${LOG_DIR}/error%i.log 48 | 49 | 20MB 50 | 51 | 52 | 53 | ${LOG_MSG} 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /docs/.markdownlint.json: -------------------------------------------------------------------------------- 1 | { 2 | "default": true, 3 | "MD002": false, 4 | "MD004": { "style": "dash" }, 5 | "ul-indent": { "indent": 2 }, 6 | "MD013": { "line_length": 600 }, 7 | "MD024": false, 8 | "MD025": false, 9 | "MD026": false, 10 | "MD029": { "style": "ordered" }, 11 | "MD033": false, 12 | "MD034": false, 13 | "MD036": false, 14 | "fenced-code-language": false, 15 | "no-hard-tabs": false, 16 | "whitespace": false, 17 | "emphasis-style": { "style": "consistent" } 18 | } 19 | -------------------------------------------------------------------------------- /docs/.vuepress/config/baiduCode.js: -------------------------------------------------------------------------------- 1 | module.exports = ''; 2 | -------------------------------------------------------------------------------- /docs/.vuepress/config/htmlModules.js: -------------------------------------------------------------------------------- 1 | /** 插入自定义html模块 (可用于插入广告模块等) 2 | * { 3 | * homeSidebarB: htmlString, 首页侧边栏底部 4 | * 5 | * sidebarT: htmlString, 全局左侧边栏顶部 6 | * sidebarB: htmlString, 全局左侧边栏底部 7 | * 8 | * pageT: htmlString, 全局页面顶部 9 | * pageB: htmlString, 全局页面底部 10 | * pageTshowMode: string, 页面顶部-显示方式:未配置默认全局;'article' => 仅文章页①; 'custom' => 仅自定义页① 11 | * pageBshowMode: string, 页面底部-显示方式:未配置默认全局;'article' => 仅文章页①; 'custom' => 仅自定义页① 12 | * 13 | * windowLB: htmlString, 全局左下角② 14 | * windowRB: htmlString, 全局右下角② 15 | * } 16 | * 17 | * ①注:在.md文件front matter配置`article: false`的页面是自定义页,未配置的默认是文章页(首页除外)。 18 | * ②注:windowLB 和 windowRB:1.展示区块最大宽高200px*400px。2.请给自定义元素定一个不超过200px*400px的宽高。3.在屏幕宽度小于960px时无论如何都不会显示。 19 | */ 20 | 21 | module.exports = { 22 | // 万维广告 23 | pageB: ` 24 |
25 | 29 | `, 30 | windowRB: ` 31 |
33 | 41 | `, 42 | } 43 | 44 | // module.exports = { 45 | // homeSidebarB: `
自定义模块测试
`, 46 | // sidebarT: `
自定义模块测试
`, 47 | // sidebarB: `
自定义模块测试
`, 48 | // pageT: `
自定义模块测试
`, 49 | // pageB: `
自定义模块测试
`, 50 | // windowLB: `
自定义模块测试
`, 51 | // windowRB: `
自定义模块测试
`, 52 | // } 53 | -------------------------------------------------------------------------------- /docs/.vuepress/plugins/love-me/index.js: -------------------------------------------------------------------------------- 1 | const path = require('path') 2 | const LoveMyPlugin = (options = {}) => ({ 3 | define() { 4 | const COLOR = 5 | options.color || 6 | 'rgb(' + ~~(255 * Math.random()) + ',' + ~~(255 * Math.random()) + ',' + ~~(255 * Math.random()) + ')' 7 | const EXCLUDECLASS = options.excludeClassName || '' 8 | return { COLOR, EXCLUDECLASS } 9 | }, 10 | enhanceAppFiles: [path.resolve(__dirname, 'love-me.js')], 11 | }) 12 | module.exports = LoveMyPlugin 13 | -------------------------------------------------------------------------------- /docs/.vuepress/plugins/love-me/love-me.js: -------------------------------------------------------------------------------- 1 | export default () => { 2 | if (typeof window !== "undefined") { 3 | (function(e, t, a) { 4 | function r() { 5 | for (var e = 0; e < s.length; e++) s[e].alpha <= 0 ? (t.body.removeChild(s[e].el), s.splice(e, 1)) : (s[e].y--, s[e].scale += .004, s[e].alpha -= .013, s[e].el.style.cssText = "left:" + s[e].x + "px;top:" + s[e].y + "px;opacity:" + s[e].alpha + ";transform:scale(" + s[e].scale + "," + s[e].scale + ") rotate(45deg);background:" + s[e].color + ";z-index:99999"); 6 | requestAnimationFrame(r) 7 | } 8 | function n() { 9 | var t = "function" == typeof e.onclick && e.onclick; 10 | 11 | e.onclick = function(e) { 12 | // 过滤指定元素 13 | let mark = true; 14 | EXCLUDECLASS && e.path && e.path.forEach((item) =>{ 15 | if(item.nodeType === 1) { 16 | typeof item.className === 'string' && item.className.indexOf(EXCLUDECLASS) > -1 ? mark = false : '' 17 | } 18 | }) 19 | 20 | if(mark) { 21 | t && t(), 22 | o(e) 23 | } 24 | } 25 | } 26 | function o(e) { 27 | var a = t.createElement("div"); 28 | a.className = "heart", 29 | s.push({ 30 | el: a, 31 | x: e.clientX - 5, 32 | y: e.clientY - 5, 33 | scale: 1, 34 | alpha: 1, 35 | color: COLOR 36 | }), 37 | t.body.appendChild(a) 38 | } 39 | function i(e) { 40 | var a = t.createElement("style"); 41 | a.type = "text/css"; 42 | try { 43 | a.appendChild(t.createTextNode(e)) 44 | } catch(t) { 45 | a.styleSheet.cssText = e 46 | } 47 | t.getElementsByTagName("head")[0].appendChild(a) 48 | } 49 | // function c() { 50 | // return "rgb(" + ~~ (255 * Math.random()) + "," + ~~ (255 * Math.random()) + "," + ~~ (255 * Math.random()) + ")" 51 | // } 52 | var s = []; 53 | e.requestAnimationFrame = e.requestAnimationFrame || e.webkitRequestAnimationFrame || e.mozRequestAnimationFrame || e.oRequestAnimationFrame || e.msRequestAnimationFrame || 54 | function(e) { 55 | setTimeout(e, 1e3 / 60) 56 | }, 57 | i(".heart{width: 10px;height: 10px;position: fixed;background: #f00;transform: rotate(45deg);-webkit-transform: rotate(45deg);-moz-transform: rotate(45deg);}.heart:after,.heart:before{content: '';width: inherit;height: inherit;background: inherit;border-radius: 50%;-webkit-border-radius: 50%;-moz-border-radius: 50%;position: fixed;}.heart:after{top: -5px;}.heart:before{left: -5px;}"), 58 | n(), 59 | r() 60 | })(window, document) 61 | } 62 | } -------------------------------------------------------------------------------- /docs/.vuepress/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dunwu/bigdata-tutorial/d4f1357fff196a5e2e624e18fd5e2b5973464e4e/docs/.vuepress/public/favicon.ico 
-------------------------------------------------------------------------------- /docs/.vuepress/public/img/bg.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dunwu/bigdata-tutorial/d4f1357fff196a5e2e624e18fd5e2b5973464e4e/docs/.vuepress/public/img/bg.gif -------------------------------------------------------------------------------- /docs/.vuepress/public/img/dunwu-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dunwu/bigdata-tutorial/d4f1357fff196a5e2e624e18fd5e2b5973464e4e/docs/.vuepress/public/img/dunwu-logo.png -------------------------------------------------------------------------------- /docs/.vuepress/public/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dunwu/bigdata-tutorial/d4f1357fff196a5e2e624e18fd5e2b5973464e4e/docs/.vuepress/public/img/favicon.ico -------------------------------------------------------------------------------- /docs/.vuepress/public/img/more.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dunwu/bigdata-tutorial/d4f1357fff196a5e2e624e18fd5e2b5973464e4e/docs/.vuepress/public/img/more.png -------------------------------------------------------------------------------- /docs/.vuepress/public/img/other.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dunwu/bigdata-tutorial/d4f1357fff196a5e2e624e18fd5e2b5973464e4e/docs/.vuepress/public/img/other.png -------------------------------------------------------------------------------- /docs/.vuepress/public/markmap/01.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Markmap 8 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /docs/.vuepress/styles/index.styl: -------------------------------------------------------------------------------- 1 | .home-wrapper .banner .banner-conent .hero h1{ 2 | font-size 2.8rem!important 3 | } 4 | // 文档中适配 5 | table 6 | width auto 7 | .page >*:not(.footer),.card-box 8 | box-shadow: none!important 9 | 10 | .page 11 | @media (min-width $contentWidth + 80) 12 | padding-top $navbarHeight!important 13 | .home-wrapper .banner .banner-conent 14 | padding 0 2.9rem 15 | box-sizing border-box 16 | .home-wrapper .banner .slide-banner .slide-banner-wrapper .slide-item a 17 | h2 18 | margin-top 2rem 19 | font-size 1.2rem!important 20 | p 21 | padding 0 1rem 22 | 23 | // 评论区颜色重置 24 | .gt-container 25 | .gt-ico-tip 26 | &::after 27 | content: '。( Win + . 
) or ( ⌃ + ⌘ + ␣ ) open Emoji' 28 | color: #999 29 | .gt-meta 30 | border-color var(--borderColor)!important 31 | .gt-comments-null 32 | color var(--textColor) 33 | opacity .5 34 | .gt-header-textarea 35 | color var(--textColor) 36 | background rgba(180,180,180,0.1)!important 37 | .gt-btn 38 | border-color $accentColor!important 39 | background-color $accentColor!important 40 | .gt-btn-preview 41 | background-color rgba(255,255,255,0)!important 42 | color $accentColor!important 43 | a 44 | color $accentColor!important 45 | .gt-svg svg 46 | fill $accentColor!important 47 | .gt-comment-content,.gt-comment-admin .gt-comment-content 48 | background-color rgba(150,150,150,0.1)!important 49 | &:hover 50 | box-shadow 0 0 25px rgba(150,150,150,.5)!important 51 | .gt-comment-body 52 | color var(--textColor)!important 53 | 54 | 55 | // qq徽章 56 | .qq 57 | position: relative; 58 | .qq::after 59 | content: "可撩"; 60 | background: $accentColor; 61 | color:#fff; 62 | padding: 0 5px; 63 | border-radius: 10px; 64 | font-size:12px; 65 | position: absolute; 66 | top: -4px; 67 | right: -35px; 68 | transform:scale(0.85); 69 | 70 | // demo模块图标颜色 71 | body .vuepress-plugin-demo-block__wrapper 72 | &,.vuepress-plugin-demo-block__display 73 | border-color rgba(160,160,160,.3) 74 | .vuepress-plugin-demo-block__footer:hover 75 | .vuepress-plugin-demo-block__expand::before 76 | border-top-color: $accentColor !important; 77 | border-bottom-color: $accentColor !important; 78 | svg 79 | fill: $accentColor !important; 80 | 81 | 82 | // 全文搜索框 83 | .suggestions 84 | overflow: auto 85 | max-height: calc(100vh - 6rem) 86 | @media (max-width: 719px) { 87 | width: 90vw; 88 | min-width: 90vw!important; 89 | margin-right: -20px; 90 | } 91 | .highlight 92 | color: $accentColor 93 | font-weight: bold 94 | -------------------------------------------------------------------------------- /docs/.vuepress/styles/palette.styl: -------------------------------------------------------------------------------- 1 | 2 | // 原主题变量已弃用,以下是vdoing使用的变量,你可以在这个文件内修改它们。 3 | 4 | //***vdoing主题-变量***// 5 | 6 | // // 颜色 7 | 8 | // $bannerTextColor = #fff // 首页banner区(博客标题)文本颜色 9 | // $accentColor = #11A8CD 10 | // $arrowBgColor = #ccc 11 | // $badgeTipColor = #42b983 12 | // $badgeWarningColor = darken(#ffe564, 35%) 13 | // $badgeErrorColor = #DA5961 14 | 15 | // // 布局 16 | // $navbarHeight = 3.6rem 17 | // $sidebarWidth = 18rem 18 | // $contentWidth = 860px 19 | // $homePageWidth = 1100px 20 | // $rightMenuWidth = 230px // 右侧菜单 21 | 22 | // // 代码块 23 | // $lineNumbersWrapperWidth = 2.5rem 24 | 25 | // 浅色模式 26 | .theme-mode-light 27 | --bodyBg: rgba(255,255,255,1) 28 | --mainBg: rgba(255,255,255,1) 29 | --sidebarBg: rgba(255,255,255,.8) 30 | --blurBg: rgba(255,255,255,.9) 31 | --textColor: #004050 32 | --textLightenColor: #0085AD 33 | --borderColor: rgba(0,0,0,.15) 34 | --codeBg: #f6f6f6 35 | --codeColor: #525252 36 | codeThemeLight() 37 | 38 | // 深色模式 39 | .theme-mode-dark 40 | --bodyBg: rgba(30,30,34,1) 41 | --mainBg: rgba(30,30,34,1) 42 | --sidebarBg: rgba(30,30,34,.8) 43 | --blurBg: rgba(30,30,34,.8) 44 | --textColor: rgb(140,140,150) 45 | --textLightenColor: #0085AD 46 | --borderColor: #2C2C3A 47 | --codeBg: #252526 48 | --codeColor: #fff 49 | codeThemeDark() 50 | 51 | // 阅读模式 52 | .theme-mode-read 53 | --bodyBg: rgba(245,245,213,1) 54 | --mainBg: rgba(245,245,213,1) 55 | --sidebarBg: rgba(245,245,213,.8) 56 | --blurBg: rgba(245,245,213,.9) 57 | --textColor: #004050 58 | --textLightenColor: #0085AD 59 | --borderColor: rgba(0,0,0,.15) 60 | --codeBg: 
#282c34 61 | --codeColor: #fff 62 | codeThemeDark() 63 | -------------------------------------------------------------------------------- /docs/16.大数据/00.综合/01.大数据简介.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 大数据简介 3 | date: 2019-05-07 20:19:25 4 | categories: 5 | - 大数据 6 | - 综合 7 | tags: 8 | - 大数据 9 | - 综合 10 | permalink: /pages/9ab9da/ 11 | --- 12 | 13 | # 大数据简介 14 | 15 | ## 简介 16 | 17 | ### 什么是大数据 18 | 19 | 大数据是指超出传统数据库工具收集、存储、管理和分析能力的数据集。与此同时,及时采集、存储、聚合、管理数据,以及对数据深度分析的新技术和新能力,正在快速增长,就像预测计算芯片增长速度的摩尔定律一样。 20 | 21 | - **Volume** - 数据规模巨大 22 | - **Velocity** - 生成和处理速度极快 23 | - **Variety** - 数据规模巨大 24 | - **Value** - 生成和处理速度极快 25 | 26 | ### 应用场景 27 | 28 | 基于大数据的数据仓库 29 | 30 | 基于大数据的实时流处理 31 | 32 | ### Hadoop 编年史 33 | 34 | | 时间 | 事件 | 35 | | :------ | :---------------------------------------------------------------------- | 36 | | 2003.01 | Google 发表了 Google File System 论文 | 37 | | 2004.01 | Google 发表了 MapReduce 论文 | 38 | | 2006.02 | Apache Hadoop 项目正式启动,并支持 MapReduce 和 HDFS 独立发展 | 39 | | 2006.11 | Google 发表了 Bigtable 论文 | 40 | | 2008.01 | Hadoop 成为 Apache 顶级项目 | 41 | | 2009.03 | Cloudera 推出世界上首个 Hadoop 发行版——CDH,并完全开放源码 | 42 | | 2012.03 | HDFS NameNode HA 加入 Hadoop 主版本 | 43 | | 2014.02 | Spark 代替 MapReduce 成为 Hadoop 的缺省计算引擎,并成为 Apache 顶级项目 | 44 | 45 | ## 技术体系 46 | 47 | ### HDFS 48 | 49 | **概念** 50 | 51 | - Hadoop 分布式文件系统(Hadoop Distributed File System) 52 | - 在开源大数据技术体系中,地位无可替代 53 | 54 | **特点** 55 | 56 | - 高容错:数据多副本,副本丢失后自动恢复 57 | - 高可用:NameNode HA,安全模式 58 | - 高扩展:10K 节点规模 59 | - 简单一致性模型:一次写入多次读取,支持追加,不允许修改 60 | - 流式数据访问:批量读而非随机读,关注吞吐量而非时间 61 | - 大规模数据集:典型文件大小 GB\~TB 级,百万以上文件数量, PB 以上数据规模 62 | - 构建成本低且安全可靠:运行在大量的廉价商用机器上,硬件错误是常态,提供容错机制 63 | 64 | ### MapReduce 65 | 66 | **概念** 67 | 68 | - 面向批处理的分布式计算框架 69 | - 编程模型:将 MapReduce 程序分为 Map、Reduce 两个阶段 70 | 71 | **核心思想** 72 | 73 | - 分而治之,分布式计算 74 | - 移动计算,而非移动数据 75 | 76 | **特点** 77 | 78 | - 高容错:任务失败,自动调度到其他节点重新执行 79 | - 高扩展:计算能力随着节点数增加,近似线性递增 80 | - 适用于海量数据的离线批处理 81 | - 降低了分布式编程的门槛 82 | 83 | ### Spark 84 | 85 | 高性能分布式通用计算引擎 86 | 87 | - Spark Core - 基础计算框架(批处理、交互式分析) 88 | - Spark SQL - SQL 引擎(海量结构化数据的高性能查询) 89 | - Spark Streaming - 实时流处理(微批) 90 | - Spark MLlib - 机器学习 91 | - Spark GraphX - 图计算 92 | 93 | 采用 Scala 语言开发 94 | 95 | **特点** 96 | 97 | - 计算高效 - 内存计算、Cache 缓存机制、DAG 引擎、多线程池模型 98 | - 通用易用 - 适用于批处理、交互式计算、流处理、机器学习、图计算等多种场景 99 | - 运行模式多样 - Local、Standalone、YARN/Mesos 100 | 101 | ### YARN 102 | 103 | **概念** 104 | 105 | - Yet Another Resource Negotiator,另一种资源管理器 106 | - 为了解决 Hadoop 1.x 中 MapReduce 的先天缺陷 107 | - 分布式通用资源管理系统 108 | - 负责集群资源的统一管理 109 | - 从 Hadoop 2.x 开始,YARN 成为 Hadoop 的核心组件 110 | 111 | **特点** 112 | 113 | - 专注于资源管理和作业调度 114 | - 通用 - 适用各种计算框架,如 - MapReduce、Spark 115 | - 高可用 - ResourceManager 高可用、HDFS 高可用 116 | - 高扩展 117 | 118 | ### Hive 119 | 120 | **概念** 121 | 122 | - Hadoop 数据仓库 - 企业决策支持 123 | - SQL 引擎 - 对海量结构化数据进行高性能的 SQL 查询 124 | - 采用 HDFS 或 HBase 为数据存储 125 | - 采用 MapReduce 或 Spark 为计算框架 126 | 127 | **特点** 128 | 129 | - 提供类 SQL 查询语言 130 | - 支持命令行或 JDBC/ODBC 131 | - 提供灵活的扩展性 132 | - 提供复杂数据类型、扩展函数、脚本等 133 | 134 | ### HBase 135 | 136 | **概念** 137 | 138 | - Hadoop Database 139 | - Google BigTable 的开源实现 140 | - 分布式 NoSQL 数据库 141 | - 列式存储 - 主要用于半结构化、非结构化数据 142 | - 采用 HDFS 为文件存储系统 143 | 144 | **特点** 145 | 146 | - 高性能 - 支持高并发写入和查询 147 | - 高可用 - HDFS 高可用、Region 高可用 148 | - 高扩展 - 数据自动切分和分布,可动态扩容,无需停机 149 | - 海量存储 - 单表可容纳数十亿行,上百万列 150 | 151 | ### ElasticSearch 152 | 153 | - 开源的分布式全文检索引擎 154 | - 基于 Lucene 实现全文数据的快速存储、搜索和分析 155 | - 处理大规模数据 - PB 级以上 156 | - 具有较强的扩展性,集群规模可达上百台 157 | - 
首选的分布式搜索引擎 158 | 159 | ## 术语 160 | 161 | **数据仓库(Data Warehouse)** - 数据仓库,是为企业所有级别的决策制定过程,提供所有类型数据支持的战略集合。它是单个数据存储,出于分析性报告和决策支持目的而创建。 为需要业务智能的企业,提供指导业务流程改进、监视时间、成本、质量以及控制。 162 | 163 | ## 资源 164 | 165 | - [awesome-bigdata](https://github.com/onurakpolat/awesome-bigdata) 166 | - [Hadoop](http://hadoop.apache.org/) 167 | - [HBase](http://hbase.apache.org/) 168 | - [Hive](http://hive.apache.org/) 169 | - [Impala](http://impala.apache.org/) 170 | - [Flume](http://flume.apache.org/) 171 | - [Kafka](http://kafka.apache.org/) 172 | - [Spark](http://spark.apache.org/) 173 | - [Sqoop](http://sqoop.apache.org/) 174 | - [ElasticSearch](https://www.elastic.co/guide/index.html) 175 | -------------------------------------------------------------------------------- /docs/16.大数据/00.综合/02.大数据学习.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 大数据学习 3 | date: 2020-06-22 00:22:25 4 | categories: 5 | - 大数据 6 | - 综合 7 | tags: 8 | - 大数据 9 | - 综合 10 | - 学习 11 | permalink: /pages/e0d035/ 12 | --- 13 | 14 | # 大数据学习路线 15 | 16 | ## 大数据简介 17 | 18 | ### 移动计算 19 | 20 | 传统的软件计算处理模型,都是“输入 -> 计算 -> 输出”模型。 21 | 22 | 如何解决 PB 级数据进行计算的问题呢? 23 | 24 | 采用分布式集群的解决方案,用数千台甚至上万台计算机构建一个大数据计算处理集群,利用更多的网络带宽、内存空间、磁盘容量、CPU 核心数去进行计算处理。 25 | 26 | 大数据计算处理通常针对的是网站的存量数据,网站大数据系统要做的就是将这些统计规律和关联关系计算出来,并由此进一步改善网站的用户体验和运营决策。 27 | 28 | 将程序分发到数据所在的地方进行计算,也就是所谓的移动计算比移动数据更划算。 29 | 30 | ### 大数据存储 31 | 32 | 大规模数据存储的核心问题: 33 | 34 | - 数据存储容量 35 | - 数据读写速度 36 | - 数据可靠性 37 | 38 | 解决方案:水平伸缩 39 | 40 | ## 大数据处理流程 41 | 42 | ![img](https://raw.githubusercontent.com/dunwu/images/master/snap/20220217114216.png) 43 | 44 | ### 1.1 数据采集 45 | 46 | 大数据处理的第一步是数据的收集。现在的中大型项目通常采用微服务架构进行分布式部署,所以数据的采集需要在多台服务器上进行,且采集过程不能影响正常业务的开展。基于这种需求,就衍生了多种日志收集工具,如 Flume 、Logstash、Kibana 等,它们都能通过简单的配置完成复杂的数据收集和数据聚合。 47 | 48 | ### 1.2 数据存储 49 | 50 | 收集到数据后,下一个问题就是:数据该如何进行存储?通常大家最为熟知是 MySQL、Oracle 等传统的关系型数据库,它们的优点是能够快速存储结构化的数据,并支持随机访问。但大数据的数据结构通常是半结构化(如日志数据)、甚至是非结构化的(如视频、音频数据),为了解决海量半结构化和非结构化数据的存储,衍生了 Hadoop HDFS 、KFS、GFS 等分布式文件系统,它们都能够支持结构化、半结构和非结构化数据的存储,并可以通过增加机器进行横向扩展。 51 | 52 | 分布式文件系统完美地解决了海量数据存储的问题,但是一个优秀的数据存储系统需要同时考虑数据存储和访问两方面的问题,比如你希望能够对数据进行随机访问,这是传统的关系型数据库所擅长的,但却不是分布式文件系统所擅长的,那么有没有一种存储方案能够同时兼具分布式文件系统和关系型数据库的优点,基于这种需求,就产生了 HBase、MongoDB。 53 | 54 | ### 1.3 数据分析 55 | 56 | 大数据处理最重要的环节就是数据分析,数据分析通常分为两种:批处理和流处理。 57 | 58 | - **批处理**:对一段时间内海量的离线数据进行统一的处理,对应的处理框架有 Hadoop MapReduce、Spark、Flink 等; 59 | - **流处理**:对运动中的数据进行处理,即在接收数据的同时就对其进行处理,对应的处理框架有 Storm、Spark Streaming、Flink Streaming 等。 60 | 61 | 批处理和流处理各有其适用的场景,时间不敏感或者硬件资源有限,可以采用批处理;时间敏感和实时性要求高就可以采用流处理。随着服务器硬件的价格越来越低和大家对及时性的要求越来越高,流处理越来越普遍,如股票价格预测和电商运营数据分析等。 62 | 63 | 上面的框架都是需要通过编程来进行数据分析,那么如果你不是一个后台工程师,是不是就不能进行数据的分析了?当然不是,大数据是一个非常完善的生态圈,有需求就有解决方案。为了能够让熟悉 SQL 的人员也能够进行数据的分析,查询分析框架应运而生,常用的有 Hive 、Spark SQL 、Flink SQL、 Pig、Phoenix 等。这些框架都能够使用标准的 SQL 或者 类 SQL 语法灵活地进行数据的查询分析。这些 SQL 经过解析优化后转换为对应的作业程序来运行,如 Hive 本质上就是将 SQL 转换为 MapReduce 作业,Spark SQL 将 SQL 转换为一系列的 RDDs 和转换关系(transformations),Phoenix 将 SQL 查询转换为一个或多个 HBase Scan。 64 | 65 | ### 1.4 数据应用 66 | 67 | 数据分析完成后,接下来就是数据应用的范畴,这取决于你实际的业务需求。比如你可以将数据进行可视化展现,或者将数据用于优化你的推荐算法,这种运用现在很普遍,比如短视频个性化推荐、电商商品推荐、头条新闻推荐等。当然你也可以将数据用于训练你的机器学习模型,这些都属于其他领域的范畴,都有着对应的框架和技术栈进行处理,这里就不一一赘述。 68 | 69 | ### 1.5 其他框架 70 | 71 | 上面是一个标准的大数据处理流程所用到的技术框架。但是实际的大数据处理流程比上面复杂很多,针对大数据处理中的各种复杂问题分别衍生了各类框架: 72 | 73 | - 单机的处理能力都是存在瓶颈的,所以大数据框架都是采用集群模式进行部署,为了更方便的进行集群的部署、监控和管理,衍生了 Ambari、Cloudera Manager 等集群管理工具; 74 | - 想要保证集群高可用,需要用到 ZooKeeper ,ZooKeeper 是最常用的分布式协调服务,它能够解决大多数集群问题,包括首领选举、失败恢复、元数据存储及其一致性保证。同时针对集群资源管理的需求,又衍生了 Hadoop YARN ; 75 | - 
复杂大数据处理的另外一个显著的问题是,如何调度多个复杂的并且彼此之间存在依赖关系的作业?基于这种需求,产生了 Azkaban 和 Oozie 等工作流调度框架; 76 | - 大数据流处理中使用的比较多的另外一个框架是 Kafka,它可以用于消峰,避免在秒杀等场景下并发数据对流处理程序造成冲击; 77 | - 另一个常用的框架是 Sqoop ,主要是解决了数据迁移的问题,它能够通过简单的命令将关系型数据库中的数据导入到 HDFS 、Hive 或 HBase 中,或者从 HDFS 、Hive 导出到关系型数据库上。 78 | 79 | ## 大数据学习路线 80 | 81 | ### 框架分类 82 | 83 | **日志收集框架**:Flume 、Logstash、Kibana 84 | 85 | **分布式文件存储系统**:Hadoop HDFS 86 | 87 | **数据库系统**:Mongodb、HBase 88 | 89 | **分布式计算框架**: 90 | 91 | - 批处理框架:Hadoop MapReduce 92 | - 流处理框架:Storm 93 | - 混合处理框架:Spark、Flink 94 | 95 | **查询分析框架**:Hive 、Spark SQL 、Flink SQL、 Pig、Phoenix 96 | 97 | **集群资源管理器**:Hadoop YARN 98 | 99 | **分布式协调服务**:Zookeeper 100 | 101 | **数据迁移工具**:Sqoop 102 | 103 | **任务调度框架**:Azkaban、Oozie 104 | 105 | **集群部署和监控**:Ambari、Cloudera Manager 106 | 107 | 上面列出的都是比较主流的大数据框架,社区都很活跃,学习资源也比较丰富。建议从 Hadoop 开始入门学习,因为它是整个大数据生态圈的基石,其它框架都直接或者间接依赖于 Hadoop 。接着就可以学习计算框架,Spark 和 Flink 都是比较主流的混合处理框架,Spark 出现得较早,所以其应用也比较广泛。 Flink 是当下最火热的新一代的混合处理框架,其凭借众多优异的特性得到了众多公司的青睐。两者可以按照你个人喜好或者实际工作需要进行学习。 108 | 109 | ![img](https://raw.githubusercontent.com/dunwu/images/master/snap/20200601160917.png) 110 | 111 | ### 学习资料 112 | 113 | 大数据最权威和最全面的学习资料就是官方文档。热门的大数据框架社区都比较活跃、版本更新迭代也比较快,所以其出版物都明显滞后于其实际版本,基于这个原因采用书本学习不是一个最好的方案。比较庆幸的是,大数据框架的官方文档都写的比较好,内容完善,重点突出,同时都采用了大量配图进行辅助讲解。当然也有一些优秀的书籍历经时间的检验,至今依然很经典,这里列出部分个人阅读过的经典书籍: 114 | 115 | - [《hadoop 权威指南 (第四版)》](https://book.douban.com/subject/27115351/) 2017 年 116 | - [《Kafka 权威指南》](https://item.jd.com/12270295.html) 2017 年 117 | - [《从 Paxos 到 Zookeeper 分布式一致性原理与实践》](https://item.jd.com/11622772.html) 2015 年 118 | - [《Spark 技术内幕 深入解析 Spark 内核架构设计与实现原理》](https://book.douban.com/subject/26649141/) 2015 年 119 | - [《Spark.The.Definitive.Guide》](https://book.douban.com/subject/27035127/) 2018 年 120 | - [《HBase 权威指南》](https://book.douban.com/subject/10748460/) 2012 年 121 | - [《Hive 编程指南》](https://book.douban.com/subject/25791255/) 2013 年 122 | 123 | ### 视频学习资料 124 | 125 | 上面我推荐的都是书籍学习资料,很少推荐视频学习资料,这里说明一下原因:因为书籍历经时间的考验,能够再版的或者豆瓣等平台评价高的证明都是被大众所认可的,从概率的角度上来说,其必然更加优秀,不容易浪费大家的学习时间和精力,所以我个人更倾向于官方文档或者书本的学习方式,而不是视频。因为视频学习资料,缺少一个公共的评价平台和完善的评价机制,所以其质量良莠不齐。但是视频任然有其不可替代的好处,学习起来更直观、印象也更深刻,所以对于习惯视频学习的小伙伴,这里我各推荐一个免费的和付费的视频学习资源,大家按需选择: 126 | 127 | - 免费学习资源:尚硅谷大数据学习路线 —— [下载链接](http://www.atguigu.com/bigdata_video.shtml#bigdata) \ [在线观看链接](https://space.bilibili.com/302417610/) 128 | - 付费学习资源:[慕课网 Michael PK 的系列课程](https://www.imooc.com/t/2781843) 129 | 130 | ## 参考资料 131 | 132 | - [大数据学习路线](https://github.com/heibaiying/BigData-Notes/blob/master/notes/%E5%A4%A7%E6%95%B0%E6%8D%AE%E5%AD%A6%E4%B9%A0%E8%B7%AF%E7%BA%BF.md) 133 | -------------------------------------------------------------------------------- /docs/16.大数据/00.综合/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 大数据综合 3 | date: 2023-02-10 14:38:43 4 | categories: 5 | - 大数据 6 | - 综合 7 | tags: 8 | - 大数据 9 | - 综合 10 | permalink: /pages/ad9b6a/ 11 | hidden: true 12 | --- 13 | 14 | # 大数据综合 15 | 16 | ## 📖 内容 17 | 18 | - [大数据简介](01.大数据简介.md) 19 | - [大数据学习](02.大数据学习.md) 20 | 21 | ## 📚 资料 22 | 23 | ## 🚪 传送 24 | 25 | ◾ 💧 [钝悟的 IT 知识图谱](https://dunwu.github.io/waterdrop/) ◾ 🎯 [我的博客](https://github.com/dunwu/blog) ◾ 26 | -------------------------------------------------------------------------------- /docs/16.大数据/01.hadoop/01.hdfs/02.HDFS运维.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: HDFS 运维 3 | date: 2020-02-24 21:14:47 4 | categories: 5 | - 大数据 6 | - hadoop 7 | - hdfs 8 | tags: 9 | - 大数据 10 | - Hadoop 11 | - HDFS 12 | permalink: 
/pages/90aeb6/ 13 | --- 14 | 15 | # HDFS 运维 16 | 17 | ## HDFS 命令 18 | 19 | ### 显示当前目录结构 20 | 21 | ```shell 22 | # 显示当前目录结构 23 | hdfs dfs -ls 24 | # 递归显示当前目录结构 25 | hdfs dfs -ls -R 26 | # 显示根目录下内容 27 | hdfs dfs -ls / 28 | ``` 29 | 30 | ### 创建目录 31 | 32 | ```shell 33 | # 创建目录 34 | hdfs dfs -mkdir 35 | # 递归创建目录 36 | hdfs dfs -mkdir -p 37 | ``` 38 | 39 | ### 删除操作 40 | 41 | ```shell 42 | # 删除文件 43 | hdfs dfs -rm 44 | # 递归删除目录和文件 45 | hdfs dfs -rm -R 46 | ``` 47 | 48 | ### 导入文件到 HDFS 49 | 50 | ```shell 51 | # 二选一执行即可 52 | hdfs dfs -put [localsrc] [dst] 53 | hdfs dfs -copyFromLocal [localsrc] [dst] 54 | ``` 55 | 56 | ### 从 HDFS 导出文件 57 | 58 | ```shell 59 | # 二选一执行即可 60 | hdfs dfs -get [dst] [localsrc] 61 | hdfs dfs -copyToLocal [dst] [localsrc] 62 | ``` 63 | 64 | ### 查看文件内容 65 | 66 | ```shell 67 | # 二选一执行即可 68 | hdfs dfs -text 69 | hdfs dfs -cat 70 | ``` 71 | 72 | ### 显示文件的最后一千字节 73 | 74 | ```shell 75 | hdfs dfs -tail 76 | # 和Linux下一样,会持续监听文件内容变化 并显示文件的最后一千字节 77 | hdfs dfs -tail -f 78 | ``` 79 | 80 | ### 拷贝文件 81 | 82 | ```shell 83 | hdfs dfs -cp [src] [dst] 84 | ``` 85 | 86 | ### 移动文件 87 | 88 | ```shell 89 | hdfs dfs -mv [src] [dst] 90 | ``` 91 | 92 | ### 统计当前目录下各文件大小 93 | 94 | - 默认单位字节 95 | - -s : 显示所有文件大小总和, 96 | - -h : 将以更友好的方式显示文件大小(例如 64.0m 而不是 67108864) 97 | 98 | ``` 99 | hdfs dfs -du 100 | ``` 101 | 102 | ### 合并下载多个文件 103 | 104 | - -nl 在每个文件的末尾添加换行符(LF) 105 | - -skip-empty-file 跳过空文件 106 | 107 | ``` 108 | hdfs dfs -getmerge 109 | # 示例 将HDFS上的hbase-policy.xml和hbase-site.xml文件合并后下载到本地的/usr/test.xml 110 | hdfs dfs -getmerge -nl /test/hbase-policy.xml /test/hbase-site.xml /usr/test.xml 111 | ``` 112 | 113 | ### 统计文件系统的可用空间信息 114 | 115 | ``` 116 | hdfs dfs -df -h / 117 | ``` 118 | 119 | ### 更改文件复制因子 120 | 121 | ``` 122 | hdfs dfs -setrep [-R] [-w] 123 | ``` 124 | 125 | - 更改文件的复制因子。如果 path 是目录,则更改其下所有文件的复制因子 126 | - -w : 请求命令是否等待复制完成 127 | 128 | ``` 129 | # 示例 130 | hdfs dfs -setrep -w 3 /user/hadoop/dir1 131 | ``` 132 | 133 | ### 权限控制 134 | 135 | ``` 136 | # 权限控制和Linux上使用方式一致 137 | # 变更文件或目录的所属群组。 用户必须是文件的所有者或超级用户。 138 | hdfs dfs -chgrp [-R] GROUP URI [URI ...] 139 | # 修改文件或目录的访问权限 用户必须是文件的所有者或超级用户。 140 | hdfs dfs -chmod [-R] URI [URI ...] 141 | # 修改文件的拥有者 用户必须是超级用户。 142 | hdfs dfs -chown [-R] [OWNER][:[GROUP]] URI [URI ] 143 | ``` 144 | 145 | ### 文件检测 146 | 147 | ``` 148 | hdfs dfs -test - [defsz] URI 149 | ``` 150 | 151 | 可选选项: 152 | 153 | - -d:如果路径是目录,返回 0。 154 | - -e:如果路径存在,则返回 0。 155 | - -f:如果路径是文件,则返回 0。 156 | - -s:如果路径不为空,则返回 0。 157 | - -r:如果路径存在且授予读权限,则返回 0。 158 | - -w:如果路径存在且授予写入权限,则返回 0。 159 | - -z:如果文件长度为零,则返回 0。 160 | 161 | ``` 162 | # 示例 163 | hdfs dfs -test -e filename 164 | ``` 165 | 166 | ## HDFS 安全模式 167 | 168 | ### 什么是安全模式? 
169 | 170 | - 安全模式是 HDFS 的一种特殊状态,在这种状态下,HDFS 只接收读数据请求,而不接收写入、删除、修改等变更请求。 171 | - 安全模式是 HDFS 确保 Block 数据安全的一种保护机制。 172 | - Active NameNode 启动时,HDFS 会进入安全模式,DataNode 主动向 NameNode 汇报可用 Block 列表等信息,在系统达到安全标准前,HDFS 一直处于“只读”状态。 173 | 174 | ### 何时正常离开安全模式 175 | 176 | - Block 上报率:DataNode 上报的可用 Block 个数 / NameNode 元数据记录的 Block 个数 177 | - 当 Block 上报率 >= 阈值时,HDFS 才能离开安全模式,默认阈值为 0.999 178 | - 不建议手动强制退出安全模式 179 | 180 | ### 触发安全模式的原因 181 | 182 | - NameNode 重启 183 | - NameNode 磁盘空间不足 184 | - Block 上报率低于阈值 185 | - DataNode 无法正常启动 186 | - 日志中出现严重异常 187 | - 用户操作不当,如:**强制关机(特别注意!)** 188 | 189 | ### 故障排查 190 | 191 | - 找到 DataNode 不能正常启动的原因,重启 DataNode 192 | - 清理 NameNode 磁盘 193 | - 谨慎操作,有问题找星环,以免丢失数据 194 | 195 | ## 参考资料 196 | 197 | - [HDFS 官方文档](http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html) 198 | - [HDFS 知识点总结](https://www.cnblogs.com/caiyisen/p/7395843.html) 199 | -------------------------------------------------------------------------------- /docs/16.大数据/01.hadoop/01.hdfs/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: HDFS 教程 3 | date: 2022-02-21 20:26:47 4 | categories: 5 | - 大数据 6 | - hadoop 7 | - hdfs 8 | tags: 9 | - 大数据 10 | - Hadoop 11 | - HDFS 12 | permalink: /pages/8d798e/ 13 | hidden: true 14 | --- 15 | 16 | # HDFS 教程 17 | 18 | > **HDFS** 是 **Hadoop Distributed File System** 的缩写,即 Hadoop 的分布式文件系统。 19 | > 20 | > HDFS 是一种用于存储具有流数据访问模式的超大文件的文件系统,它运行在廉价的机器集群上。 21 | > 22 | > HDFS 的设计目标是管理数以千计的服务器、数以万计的磁盘,将这么大规模的服务器计算资源当作一个单一的存储系统进行管理,对应用程序提供数以 PB 计的存储容量,让应用程序像使用普通文件系统一样存储大规模的文件数据。 23 | > 24 | > HDFS 是在一个大规模分布式服务器集群上,对数据分片后进行并行读写及冗余存储。因为 HDFS 可以部署在一个比较大的服务器集群上,集群中所有服务器的磁盘都可供 HDFS 使用,所以整个 HDFS 的存储空间可以达到 PB 级容量。 25 | 26 | ## 📖 内容 27 | 28 | - [HDFS 入门](01.HDFS入门.md) 29 | - [HDFS 运维](02.HDFS运维.md) 30 | - [HDFS Java API](03.HDFSJavaApi.md) 31 | 32 | ## 📚 资料 33 | 34 | - **官方** 35 | - [HDFS 官方文档](http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html) 36 | - **书籍** 37 | - [《Hadoop 权威指南(第四版)》](https://item.jd.com/12109713.html) 38 | - **文章** 39 | - [翻译经典 HDFS 原理讲解漫画](https://blog.csdn.net/hudiefenmu/article/details/37655491) 40 | - [HDFS 知识点总结](https://www.cnblogs.com/caiyisen/p/7395843.html) 41 | 42 | ## 🚪 传送 43 | 44 | ◾ 💧 [钝悟的 IT 知识图谱](https://dunwu.github.io/waterdrop/) ◾ 🎯 [钝悟的博客](https://dunwu.github.io/blog/) ◾ 45 | -------------------------------------------------------------------------------- /docs/16.大数据/01.hadoop/02.yarn.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: YARN 3 | date: 2019-05-07 20:19:25 4 | categories: 5 | - 大数据 6 | - hadoop 7 | tags: 8 | - 大数据 9 | - Hadoop 10 | - YARN 11 | permalink: /pages/406588/ 12 | --- 13 | 14 | # YARN 15 | 16 | > YARN 的目标是解决 MapReduce 的缺陷。 17 | 18 | ## MapReduce 的缺陷(Hadoop 1.x) 19 | 20 | - 身兼两职:计算框架 + 资源管理框架 21 | - JobTracker 22 | - 既做资源管理,又做任务调度 23 | - 任务太重,开销过大 24 | - 存在单点故障 25 | - 资源描述模型过于简单,资源利用率较低 26 | - 仅把 Task 数量看作资源,没有考虑 CPU 和内存 27 | - 强制把资源分成 Map Task Slot 和 Reduce Task Slot 28 | - 扩展性较差,集群规模上限 4K 29 | - 源码难于理解,升级维护困难 30 | 31 | ## YARN 简介 32 | 33 | YARN(Yet Another Resource Negotiator,另一种资源管理器)是一个**分布式通用资源管理系统**。 34 | 35 | 设计目标:聚焦资源管理、通用(适用各种计算框架)、高可用、高扩展。 36 | 37 | ## YARN 系统架构 38 | 39 | - 主从结构(master/slave) 40 | - 将 JobTracker 的资源管理、任务调度功能分离 41 | - 三种角色: 42 | - ResourceManager(Master) - 集群资源的统一管理和分配 43 | - NodeManager(Slave) - 管理节点资源,以及容器的生命周期 44 | - ApplicationMaster(新角色) - 管理应用程序实例,包括任务调度和资源申请 45 | 46 | ### ResourceManager(RM) 47 | 48 | **主要功能** 49 | 50 | - 
统一管理集群的所有资源 51 | - 将资源按照一定策略分配给各个应用(ApplicationMaster) 52 | - 接收 NodeManager 的资源上报信息 53 | 54 | **核心组件** 55 | 56 | - 用户交互服务(User Service) 57 | - NodeManager 管理 58 | - ApplicationMaster 管理 59 | - Application 管理 60 | - 安全管理 61 | - 资源管理 62 | 63 | ### NodeManager(NM) 64 | 65 | **主要功能** 66 | 67 | - 管理单个节点的资源 68 | - 向 ResourceManager 汇报节点资源使用情况 69 | - 管理 Container 的生命周期 70 | 71 | **核心组件** 72 | 73 | - NodeStatusUpdater 74 | - ContainerManager 75 | - ContainerExecutor 76 | - NodeHealthCheckerService 77 | - Security 78 | - WebServer 79 | 80 | ### ApplicationMaster(AM) 81 | 82 | **主要功能** 83 | 84 | - 管理应用程序实例 85 | - 向 ResourceManager 申请任务执行所需的资源 86 | - 任务调度和监管 87 | 88 | **实现方式** 89 | 90 | - 需要为每个应用开发一个 AM 组件 91 | - YARN 提供 MapReduce 的 ApplicationMaster 实现 92 | - 采用基于事件驱动的异步编程模型,由中央事件调度器统一管理所有事件 93 | - 每种组件都是一种事件处理器,在中央事件调度器中注册 94 | 95 | ### Container 96 | 97 | - 概念:Container 封装了节点上进程的相关资源,是 YARN 中资源的抽象 98 | - 分类:运行 ApplicationMaster 的 Container 、运行应用任务的 Container 99 | 100 | ## YARN 高可用 101 | 102 | ResourceManager 高可用 103 | 104 | - 1 个 Active RM、多个 Standby RM 105 | - 宕机后自动实现主备切换 106 | - ZooKeeper 的核心作用 107 | - Active 节点选举 108 | - 恢复 Active RM 的原有状态信息 109 | - 重启 AM,杀死所有运行中的 Container 110 | - 切换方式:手动、自动 111 | 112 | ## YARN 资源调度策略 113 | 114 | ### FIFO Scheduler(先进先出调度器) 115 | 116 | **调度策略** 117 | 118 | 将所有任务放入一个队列,先进队列的先获得资源,排在后面的任务只有等待 119 | 120 | **缺点** 121 | 122 | - 资源利用率低,无法交叉运行任务 123 | - 灵活性差,如:紧急任务无法插队,耗时长的任务拖慢耗时短的任务 124 | 125 | ### Capacity Scheduler(容量调度器) 126 | 127 | **核心思想** - 提前**做预算**,在预算指导下分享集群资源。 128 | 129 | **调度策略** 130 | 131 | - 集群资源由多个队列分享 132 | - 每个队列都要预设资源分配的比例(提前做预算) 133 | - 空闲资源优先分配给“实际资源/预算资源”比值最低的队列 134 | - 队列内部采用 FIFO 调度策略 135 | 136 | **特点** 137 | 138 | - 层次化的队列设计:子队列可使用父队列资源 139 | - 容量保证:每个队列都要预设资源占比,防止资源独占 140 | - 弹性分配:空闲资源可以分配给任何队列,当多个队列争用时,会按比例进行平衡 141 | - 支持动态管理:可以动态调整队列的容量、权限等参数,也可动态增加、暂停队列 142 | - 访问控制:用户只能向自己的队列中提交任务,不能访问其他队列 143 | - 多租户:多用户共享集群资源 144 | 145 | ### Fair Scheduler(公平调度器) 146 | 147 | **调度策略** 148 | 149 | - 多队列公平共享集群资源 150 | - 通过平分的方式,动态分配资源,无需预先设定资源分配比例 151 | - 队列内部可配置调度策略:FIFO、Fair(默认) 152 | 153 | **资源抢占** 154 | 155 | - 终止其他队列的任务,使其让出所占资源,然后将资源分配给占用资源量少于最小资源量限制的队列 156 | 157 | **队列权重** 158 | 159 | - 当队列中有任务等待,并且集群中有空闲资源时,每个队列可以根据权重获得不同比例的空闲资源 160 | 161 | ## 资源 162 | -------------------------------------------------------------------------------- /docs/16.大数据/01.hadoop/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Hadoop 教程 3 | date: 2020-09-09 17:53:08 4 | categories: 5 | - 大数据 6 | - hadoop 7 | tags: 8 | - 大数据 9 | - Hadoop 10 | permalink: /pages/680e30/ 11 | hidden: true 12 | --- 13 | 14 | # Hadoop 教程 15 | 16 | ## 📖 内容 17 | 18 | ### HDFS 19 | 20 | - [HDFS 入门](01.hdfs/01.HDFS入门.md) 21 | - [HDFS 运维](01.hdfs/02.HDFS运维.md) 22 | - [HDFS Java API](01.hdfs/03.HDFSJavaApi.md) 23 | 24 | ### YARN 25 | 26 | ### MapReduce 27 | 28 | ## 📚 资料 29 | 30 | ## 🚪 传送 31 | 32 | ◾ 💧 [钝悟的 IT 知识图谱](https://dunwu.github.io/waterdrop/) ◾ 🎯 [钝悟的博客](https://dunwu.github.io/blog/) ◾ 33 | -------------------------------------------------------------------------------- /docs/16.大数据/02.hive/02.Hive表.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Hive 分区表和分桶表 3 | date: 2020-02-24 21:14:47 4 | categories: 5 | - 大数据 6 | - hive 7 | tags: 8 | - 大数据 9 | - Hive 10 | permalink: /pages/18eb58/ 11 | --- 12 | 13 | # Hive 分区表和分桶表 14 | 15 | ## 分区表 16 | 17 | ### 概念 18 | 19 | Hive 中的表对应为 HDFS 上的指定目录,在查询数据时候,默认会对全表进行扫描,这样时间和性能的消耗都非常大。 20 | 21 | **分区为 HDFS 上表目录的子目录**,数据按照分区存储在子目录中。如果查询的 
`where` 子句中包含分区条件,则直接从该分区去查找,而不是扫描整个表目录,合理的分区设计可以极大提高查询速度和性能。 22 | 23 | > 这里说明一下分区表并非 Hive 独有的概念,实际上这个概念非常常见。比如在我们常用的 Oracle 数据库中,当表中的数据量不断增大,查询数据的速度就会下降,这时也可以对表进行分区。表进行分区后,逻辑上表仍然是一张完整的表,只是将表中的数据存放到多个表空间(物理文件上),这样查询数据时,就不必要每次都扫描整张表,从而提升查询性能。 24 | 25 | ### 使用场景 26 | 27 | 通常,在管理大规模数据集的时候都需要进行分区,比如将日志文件按天进行分区,从而保证数据细粒度的划分,使得查询性能得到提升。 28 | 29 | ### 创建分区表 30 | 31 | 在 Hive 中可以使用 `PARTITIONED BY` 子句创建分区表。表可以包含一个或多个分区列,程序会为分区列中的每个不同值组合创建单独的数据目录。下面的我们创建一张雇员表作为测试: 32 | 33 | ```sql 34 | CREATE EXTERNAL TABLE emp_partition( 35 | empno INT, 36 | ename STRING, 37 | job STRING, 38 | mgr INT, 39 | hiredate TIMESTAMP, 40 | sal DECIMAL(7,2), 41 | comm DECIMAL(7,2) 42 | ) 43 | PARTITIONED BY (deptno INT) -- 按照部门编号进行分区 44 | ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t" 45 | LOCATION '/hive/emp_partition'; 46 | ``` 47 | 48 | ### 加载数据到分区表 49 | 50 | 加载数据到分区表时候必须要指定数据所处的分区: 51 | 52 | ```shell 53 | # 加载部门编号为20的数据到表中 54 | LOAD DATA LOCAL INPATH "/usr/file/emp20.txt" OVERWRITE INTO TABLE emp_partition PARTITION (deptno=20) 55 | # 加载部门编号为30的数据到表中 56 | LOAD DATA LOCAL INPATH "/usr/file/emp30.txt" OVERWRITE INTO TABLE emp_partition PARTITION (deptno=30) 57 | ``` 58 | 59 | ### 查看分区目录 60 | 61 | 这时候我们直接查看表目录,可以看到表目录下存在两个子目录,分别是 `deptno=20` 和 `deptno=30`,这就是分区目录,分区目录下才是我们加载的数据文件。 62 | 63 | ```shell 64 | # hadoop fs -ls hdfs://hadoop001:8020/hive/emp_partition/ 65 | ``` 66 | 67 | 这时候当你的查询语句的 `where` 包含 `deptno=20`,则就去对应的分区目录下进行查找,而不用扫描全表。 68 | 69 | ![img](https://github.com/heibaiying/BigData-Notes/raw/master/pictures/hive-hadoop-partitation.png) 70 | 71 | ## 分桶表 72 | 73 | ### 简介 74 | 75 | 分区提供了一个隔离数据和优化查询的可行方案,但是并非所有的数据集都可以形成合理的分区,分区的数量也不是越多越好,过多的分区条件可能会导致很多分区上没有数据。同时 Hive 会限制动态分区可以创建的最大分区数,用来避免过多分区文件对文件系统产生负担。鉴于以上原因,Hive 还提供了一种更加细粒度的数据拆分方案:分桶表 (bucket Table)。 76 | 77 | 分桶表会将指定列的值进行哈希散列,并对 bucket(桶数量)取余,然后存储到对应的 bucket(桶)中。 78 | 79 | ### 理解分桶表 80 | 81 | 单从概念上理解分桶表可能会比较晦涩,其实和分区一样,分桶这个概念同样不是 Hive 独有的,对于 Java 开发人员而言,这可能是一个每天都会用到的概念,因为 Hive 中的分桶概念和 Java 数据结构中的 HashMap 的分桶概念是一致的。 82 | 83 | 当调用 HashMap 的 put() 方法存储数据时,程序会先对 key 值调用 hashCode() 方法计算出 hashcode,然后对数组长度取模计算出 index,最后将数据存储在数组 index 位置的链表上,链表达到一定阈值后会转换为红黑树 (JDK1.8+)。下图为 HashMap 的数据结构图: 84 | 85 | ![img](https://raw.githubusercontent.com/dunwu/images/master/snap/20200224194352.png) 86 | 87 | 图片引用自:[HashMap vs. 
Hashtable](http://www.itcuties.com/java/hashmap-hashtable/) 88 | 89 | ### 创建分桶表 90 | 91 | 在 Hive 中,我们可以通过 `CLUSTERED BY` 指定分桶列,并通过 `SORTED BY` 指定桶中数据的排序参考列。下面为分桶表建表语句示例: 92 | 93 | ```sql 94 | CREATE EXTERNAL TABLE emp_bucket( 95 | empno INT, 96 | ename STRING, 97 | job STRING, 98 | mgr INT, 99 | hiredate TIMESTAMP, 100 | sal DECIMAL(7,2), 101 | comm DECIMAL(7,2), 102 | deptno INT) 103 | CLUSTERED BY(empno) SORTED BY(empno ASC) INTO 4 BUCKETS --按照员工编号散列到四个 bucket 中 104 | ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t" 105 | LOCATION '/hive/emp_bucket'; 106 | ``` 107 | 108 | ### 加载数据到分桶表 109 | 110 | 这里直接使用 `Load` 语句向分桶表加载数据,数据时可以加载成功的,但是数据并不会分桶。 111 | 112 | 这是由于分桶的实质是对指定字段做了 hash 散列然后存放到对应文件中,这意味着向分桶表中插入数据是必然要通过 MapReduce,且 Reducer 的数量必须等于分桶的数量。由于以上原因,分桶表的数据通常只能使用 CTAS(CREATE TABLE AS SELECT) 方式插入,因为 CTAS 操作会触发 MapReduce。加载数据步骤如下: 113 | 114 | #### 设置强制分桶 115 | 116 | ```sql 117 | set hive.enforce.bucketing = true; --Hive 2.x 不需要这一步 118 | ``` 119 | 120 | 在 Hive 0.x and 1.x 版本,必须使用设置 `hive.enforce.bucketing = true`,表示强制分桶,允许程序根据表结构自动选择正确数量的 Reducer 和 cluster by column 来进行分桶。 121 | 122 | #### CTAS 导入数据 123 | 124 | ```sql 125 | INSERT INTO TABLE emp_bucket SELECT * FROM emp; --这里的 emp 表就是一张普通的雇员表 126 | ``` 127 | 128 | 可以从执行日志看到 CTAS 触发 MapReduce 操作,且 Reducer 数量和建表时候指定 bucket 数量一致: 129 | 130 | ![img](https://github.com/heibaiying/BigData-Notes/raw/master/pictures/hive-hadoop-mapreducer.png) 131 | 132 | ### 查看分桶文件 133 | 134 | bucket(桶) 本质上就是表目录下的具体文件: 135 | 136 | ![img](https://github.com/heibaiying/BigData-Notes/raw/master/pictures/hive-hadoop-bucket.png) 137 | 138 | ## 分区表和分桶表结合使用 139 | 140 | 分区表和分桶表的本质都是将数据按照不同粒度进行拆分,从而使得在查询时候不必扫描全表,只需要扫描对应的分区或分桶,从而提升查询效率。两者可以结合起来使用,从而保证表数据在不同粒度上都能得到合理的拆分。下面是 Hive 官方给出的示例: 141 | 142 | ```sql 143 | CREATE TABLE page_view_bucketed( 144 | viewTime INT, 145 | userid BIGINT, 146 | page_url STRING, 147 | referrer_url STRING, 148 | ip STRING ) 149 | PARTITIONED BY(dt STRING) 150 | CLUSTERED BY(userid) SORTED BY(viewTime) INTO 32 BUCKETS 151 | ROW FORMAT DELIMITED 152 | FIELDS TERMINATED BY '\001' 153 | COLLECTION ITEMS TERMINATED BY '\002' 154 | MAP KEYS TERMINATED BY '\003' 155 | STORED AS SEQUENCEFILE; 156 | ``` 157 | 158 | 此时导入数据时需要指定分区: 159 | 160 | ```shell 161 | INSERT OVERWRITE page_view_bucketed 162 | PARTITION (dt='2009-02-25') 163 | SELECT * FROM page_view WHERE dt='2009-02-25'; 164 | ``` 165 | 166 | ## 参考资料 167 | 168 | - [LanguageManual DDL BucketedTables](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL+BucketedTables) 169 | -------------------------------------------------------------------------------- /docs/16.大数据/02.hive/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Hive 教程 3 | date: 2020-09-09 17:53:08 4 | categories: 5 | - 大数据 6 | - hive 7 | tags: 8 | - 大数据 9 | - Hive 10 | permalink: /pages/a958fe/ 11 | hidden: true 12 | --- 13 | 14 | # Hive 教程 15 | 16 | ## 📖 内容 17 | 18 | - [Hive 入门](01.Hive入门.md) 19 | - [Hive 表](02.Hive表.md) 20 | - [Hive 视图和索引](03.Hive视图和索引.md) 21 | - [Hive 查询](04.Hive查询.md) 22 | - [Hive DDL](05.HiveDDL.md) 23 | - [Hive DML](06.HiveDML.md) 24 | - [Hive 运维](07.Hive运维.md) 25 | 26 | ## 📚 资料 27 | 28 | ## 🚪 传送 29 | 30 | ◾ 💧 [钝悟的 IT 知识图谱](https://dunwu.github.io/waterdrop/) ◾ 🎯 [钝悟的博客](https://dunwu.github.io/blog/) ◾ 31 | -------------------------------------------------------------------------------- /docs/16.大数据/03.hbase/02.HBase命令.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: HBase 命令 3 | date: 
2020-06-02 22:28:18 4 | categories: 5 | - 大数据 6 | - hbase 7 | tags: 8 | - 大数据 9 | - HBase 10 | permalink: /pages/263c40/ 11 | --- 12 | 13 | # HBase 命令 14 | 15 | > 进入 HBase Shell 控制台:`./bin/hbase shell` 16 | > 17 | > 如果有 kerberos 认证,需要事先使用相应的 keytab 进行一下认证(使用 kinit 命令),认证成功之后再使用 hbase shell 进入可以使用 whoami 命令可查看当前用户. 18 | 19 | ## 基本命令 20 | 21 | - 获取帮助信息:`help` 22 | - 获取命令的详细帮助信息:`help 'status'` 23 | - 查看服务器状态:`status` 24 | - 查看版本信息:`version` 25 | - 查看当前登录用户:`whoami` 26 | 27 | ## DDL 28 | 29 | ### 创建表 30 | 31 | 【语法】`create '表名称','列族名称 1','列族名称 2','列名称 N'` 32 | 33 | 【示例】 34 | 35 | ```shell 36 | # 创建一张名为 test 的表,columnFamliy1、columnFamliy2 是 table1 表的列族。 37 | create 'test','columnFamliy1','columnFamliy2' 38 | ``` 39 | 40 | ### 启用、禁用表 41 | 42 | - 启用表:`enable 'test'` 43 | - 禁用表:`disable 'test'` 44 | - 检查表是否被启用:`is_enabled 'test'` 45 | - 检查表是否被禁用:`is_disabled 'test'` 46 | 47 | ### 删除表 48 | 49 | 注意:删除表前需要先禁用表 50 | 51 | ```shell 52 | disable 'test' 53 | drop 'test' 54 | ``` 55 | 56 | ### 修改表 57 | 58 | #### 添加列族 59 | 60 | **命令格式**: alter '表名', '列族名' 61 | 62 | ```shell 63 | alter 'test', 'teacherInfo' 64 | ``` 65 | 66 | #### 删除列族 67 | 68 | **命令格式**:alter '表名', {NAME => '列族名', METHOD => 'delete'} 69 | 70 | ```shell 71 | alter 'test', {NAME => 'teacherInfo', METHOD => 'delete'} 72 | ``` 73 | 74 | #### 更改列族存储版本的限制 75 | 76 | 默认情况下,列族只存储一个版本的数据,如果需要存储多个版本的数据,则需要修改列族的属性。修改后可通过 `desc` 命令查看。 77 | 78 | ```shell 79 | alter 'test',{NAME=>'columnFamliy1',VERSIONS=>3} 80 | ``` 81 | 82 | ### 查看表 83 | 84 | - 查看所有表:`list` 85 | - 查看表的详细信息:`describe 'test'` 86 | - 检查表是否存在:`exists 'test'` 87 | 88 | ## 增删改 89 | 90 | ### 插入数据 91 | 92 | **命令格式**:`put '表名', '行键','列族:列','值'` 93 | 94 | **注意:如果新增数据的行键值、列族名、列名与原有数据完全相同,则相当于更新操作** 95 | 96 | ```shell 97 | put 'test', 'rowkey1', 'columnFamliy1:a', 'valueA' 98 | put 'test', 'rowkey1', 'columnFamliy1:b', 'valueB' 99 | put 'test', 'rowkey1', 'columnFamliy1:c', 'valueC' 100 | 101 | put 'test', 'rowkey2', 'columnFamliy1:a', 'valueA' 102 | put 'test', 'rowkey2', 'columnFamliy1:b', 'valueB' 103 | put 'test', 'rowkey2', 'columnFamliy1:c', 'valueC' 104 | 105 | put 'test', 'rowkey3', 'columnFamliy1:a', 'valueA' 106 | put 'test', 'rowkey3', 'columnFamliy1:b', 'valueB' 107 | put 'test', 'rowkey3', 'columnFamliy1:c', 'valueC' 108 | 109 | put 'test', 'rowkey1', 'columnFamliy2:a', 'valueA' 110 | put 'test', 'rowkey1', 'columnFamliy2:b', 'valueB' 111 | put 'test', 'rowkey1', 'columnFamliy2:c', 'valueC' 112 | ``` 113 | 114 | ### 获取指定行、列族、列 115 | 116 | - 获取指定行中所有列的数据信息:`get 'test','rowkey2'` 117 | - 获取指定行中指定列族下所有列的数据信息:`get 'test','rowkey2','columnFamliy1'` 118 | - 获取指定行中指定列的数据信息:`get 'test','rowkey2','columnFamliy1:a'` 119 | 120 | ### 删除指定行、列 121 | 122 | - 删除指定行:`delete 'test','rowkey2'` 123 | - 删除指定行中指定列的数据:`delete 'test','rowkey2','columnFamliy1:a'` 124 | 125 | ## 查询 126 | 127 | hbase 中访问数据有两种基本的方式: 128 | 129 | - 按指定 rowkey 获取数据:`get` 方法; 130 | - 按指定条件获取数据:`scan` 方法。 131 | 132 | `scan` 可以设置 begin 和 end 参数来访问一个范围内所有的数据。get 本质上就是 begin 和 end 相等的一种特殊的 scan。 133 | 134 | ### get 查询 135 | 136 | - 获取指定行中所有列的数据信息:`get 'test','rowkey2'` 137 | - 获取指定行中指定列族下所有列的数据信息:`get 'test','rowkey2','columnFamliy1'` 138 | - 获取指定行中指定列的数据信息:`get 'test','rowkey2','columnFamliy1:a'` 139 | 140 | ### scan 查询 141 | 142 | #### 查询整表数据 143 | 144 | ```shell 145 | scan 'test' 146 | ``` 147 | 148 | #### 查询指定列簇的数据 149 | 150 | ```shell 151 | scan 'test', {COLUMN=>'columnFamliy1'} 152 | ``` 153 | 154 | #### 条件查询 155 | 156 | ```shell 157 | # 查询指定列的数据 158 | scan 'test', {COLUMNS=> 'columnFamliy1:a'} 159 | ``` 160 | 161 | 除了列 `(COLUMNS)` 
修饰词外,HBase 还支持 `Limit`(限制查询结果行数),`STARTROW`(`ROWKEY` 起始行,会先根据这个 `key` 定位到 `region`,再向后扫描)、`STOPROW`(结束行)、`TIMERANGE`(限定时间戳范围)、`VERSIONS`(版本数)、和 `FILTER`(按条件过滤行)等。 162 | 163 | 如下代表从 `rowkey2` 这个 `rowkey` 开始,查找下两个行的最新 3 个版本的 name 列的数据: 164 | 165 | ```shell 166 | scan 'test', {COLUMNS=> 'columnFamliy1:a',STARTROW => 'rowkey2',STOPROW => 'rowkey3',LIMIT=>2, VERSIONS=>3} 167 | ``` 168 | 169 | #### 条件过滤 170 | 171 | Filter 可以设定一系列条件来进行过滤。如我们要查询值等于 24 的所有数据: 172 | 173 | ```shell 174 | scan 'test', FILTER=>"ValueFilter(=,'binary:24')" 175 | ``` 176 | 177 | 值包含 valueA 的所有数据: 178 | 179 | ```shell 180 | scan 'test', FILTER=>"ValueFilter(=,'substring:valueA')" 181 | ``` 182 | 183 | 列名中的前缀为 b 的: 184 | 185 | ```shell 186 | scan 'test', FILTER=>"ColumnPrefixFilter('b')" 187 | ``` 188 | 189 | FILTER 中支持多个过滤条件通过括号、AND 和 OR 进行组合: 190 | 191 | ```shell 192 | # 列名中的前缀为 b 且列值中包含1998的数据 193 | scan 'test', FILTER=>"ColumnPrefixFilter('b') AND ValueFilter ValueFilter(=,'substring:A')" 194 | ``` 195 | 196 | `PrefixFilter` 用于对 Rowkey 的前缀进行判断: 197 | 198 | ```shell 199 | scan 'test', FILTER=>"PrefixFilter('wr')" 200 | ``` 201 | 202 | ## 参考资料 203 | 204 | - [Hbase 常用 Shell 命令](https://github.com/heibaiying/BigData-Notes/blob/master/notes/Hbase_Shell.md) 205 | -------------------------------------------------------------------------------- /docs/16.大数据/03.hbase/03.HBase运维.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: HBase 运维 3 | date: 2019-05-07 20:19:25 4 | categories: 5 | - 大数据 6 | - hbase 7 | tags: 8 | - 大数据 9 | - HBase 10 | - 运维 11 | permalink: /pages/f808fc/ 12 | --- 13 | 14 | # HBase 运维 15 | 16 | ## 配置文件 17 | 18 | - `backup-masters` - 默认情况下不存在。列出主服务器应在其上启动备份主进程的主机,每行一个主机。 19 | - `hadoop-metrics2-hbase.properties` - 用于连接 HBase Hadoop 的 Metrics2 框架。 20 | - `hbase-env.cmd` and hbase-env.sh - 用于 Windows 和 Linux / Unix 环境的脚本,用于设置 HBase 的工作环境,包括 Java,Java 选项和其他环境变量的位置。 21 | - `hbase-policy.xml` - RPC 服务器用于对客户端请求进行授权决策的默认策略配置文件。仅在启用 HBase 安全性时使用。 22 | - `hbase-site.xml` - 主要的 HBase 配置文件。此文件指定覆盖 HBase 默认配置的配置选项。您可以在 docs / hbase-default.xml 中查看(但不要编辑)默认配置文件。您还可以在 HBase Web UI 的 HBase 配置选项卡中查看群集的整个有效配置(默认值和覆盖)。 23 | - `log4j.properties` - log4j 日志配置。 24 | - `regionservers` - 包含应在 HBase 集群中运行 RegionServer 的主机列表。默认情况下,此文件包含单个条目 localhost。它应包含主机名或 IP 地址列表,每行一个,并且如果群集中的每个节点将在其 localhost 接口上运行 RegionServer,则应仅包含 localhost。 25 | 26 | ## 环境要求 27 | 28 | - Java 29 | - HBase 2.0+ 要求 JDK8+ 30 | - HBase 1.2+ 要求 JDK7+ 31 | - SSH - 环境要支持 SSH 32 | - DNS - 环境中要在 hosts 配置本机 hostname 和本机 IP 33 | - NTP - HBase 集群的时间要同步,可以配置统一的 NTP 34 | - 平台 - 生产环境不推荐部署在 Windows 系统中 35 | - Hadoop - 依赖 Hadoop 配套版本 36 | - Zookeeper - 依赖 Zookeeper 配套版本 37 | 38 | ## 运行模式 39 | 40 | ### 单点 41 | 42 | hbase-site.xml 配置如下: 43 | 44 | ```xml 45 | 46 | 47 | hbase.rootdir 48 | hdfs://namenode.example.org:8020/hbase 49 | 50 | 51 | hbase.cluster.distributed 52 | false 53 | 54 | 55 | ``` 56 | 57 | ### 分布式 58 | 59 | hbase-site.xm 配置如下: 60 | 61 | ```xml 62 | 63 | 64 | hbase.rootdir 65 | hdfs://namenode.example.org:8020/hbase 66 | 67 | 68 | hbase.cluster.distributed 69 | true 70 | 71 | 72 | hbase.zookeeper.quorum 73 | node-a.example.com,node-b.example.com,node-c.example.com 74 | 75 | 76 | ``` 77 | 78 | ## 引用和引申 79 | 80 | ### 扩展阅读 81 | 82 | - [Apache HBase Configuration](http://hbase.apache.org/book.html#configuration) 83 | -------------------------------------------------------------------------------- /docs/16.大数据/03.hbase/README.md: -------------------------------------------------------------------------------- 1 | --- 
2 | title: HBase 教程 3 | date: 2020-09-09 17:53:08 4 | categories: 5 | - 大数据 6 | - hbase 7 | tags: 8 | - 大数据 9 | - HBase 10 | permalink: /pages/417be6/ 11 | hidden: true 12 | --- 13 | 14 | # HBase 教程 15 | 16 | ## 📖 内容 17 | 18 | - [HBase 原理](01.HBase原理.md) 19 | - [HBase 命令](02.HBase命令.md) 20 | - [HBase 运维](03.HBase运维.md) 21 | 22 | ## 📚 资料 23 | 24 | - **官方** 25 | - [HBase 官网](http://hbase.apache.org/) 26 | - [HBase 官方文档](https://hbase.apache.org/book.html) 27 | - [HBase 官方文档中文版](http://abloz.com/hbase/book.html) 28 | - [HBase API](https://hbase.apache.org/apidocs/index.html) 29 | - **教程** 30 | - [BigData-Notes](https://github.com/heibaiying/BigData-Notes) 31 | - **书籍** 32 | - [《Hadoop 权威指南(第四版)》](https://item.jd.com/12109713.html) 33 | - **文章** 34 | - [Bigtable: A Distributed Storage System for Structured Data](https://static.googleusercontent.com/media/research.google.com/zh-CN//archive/bigtable-osdi06.pdf) 35 | - [Intro to HBase](https://www.slideshare.net/alexbaranau/intro-to-hbase) 36 | 37 | ## 🚪 传送 38 | 39 | ◾ 💧 [钝悟的 IT 知识图谱](https://dunwu.github.io/waterdrop/) ◾ 🎯 [钝悟的博客](https://dunwu.github.io/blog/) ◾ 40 | -------------------------------------------------------------------------------- /docs/16.大数据/04.zookeeper/03.ZooKeeper运维.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: ZooKeeper运维 3 | date: 2020-06-02 22:28:38 4 | categories: 5 | - 大数据 6 | - zookeeper 7 | tags: 8 | - 分布式 9 | - 大数据 10 | - ZooKeeper 11 | permalink: /pages/bb5e61/ 12 | --- 13 | 14 | # ZooKeeper 运维指南 15 | 16 | ## 单点服务部署 17 | 18 | 在安装 ZooKeeper 之前,请确保你的系统是在以下任一操作系统上运行: 19 | 20 | - **任意 Linux OS** - 支持开发和部署。适合演示应用程序。 21 | - **Windows OS** - 仅支持开发。 22 | - **Mac OS** - 仅支持开发。 23 | 24 | 安装步骤如下: 25 | 26 | ### 下载解压 27 | 28 | 进入官方下载地址:[http://zookeeper.apache.org/releases.html#download](http://zookeeper.apache.org/releases.html#download) ,选择合适版本。 29 | 30 | 解压到本地: 31 | 32 | ```bash 33 | tar -zxf zookeeper-3.4.6.tar.gz 34 | cd zookeeper-3.4.6 35 | ``` 36 | 37 | ### 环境变量 38 | 39 | 执行 `vim /etc/profile`,添加环境变量: 40 | 41 | ```bash 42 | export ZOOKEEPER_HOME=/usr/app/zookeeper-3.4.14 43 | export PATH=$ZOOKEEPER_HOME/bin:$PATH 44 | ``` 45 | 46 | 再执行 `source /etc/profile` , 使得配置的环境变量生效。 47 | 48 | ### 修改配置 49 | 50 | 你必须创建 `conf/zoo.cfg` 文件,否则启动时会提示你没有此文件。 51 | 52 | 初次尝试,不妨直接使用 Kafka 提供的模板配置文件 `conf/zoo_sample.cfg`: 53 | 54 | ```bash 55 | cp conf/zoo_sample.cfg conf/zoo.cfg 56 | ``` 57 | 58 | 修改后完整配置如下: 59 | 60 | ```properties 61 | # The number of milliseconds of each tick 62 | tickTime=2000 63 | # The number of ticks that the initial 64 | # synchronization phase can take 65 | initLimit=10 66 | # The number of ticks that can pass between 67 | # sending a request and getting an acknowledgement 68 | syncLimit=5 69 | # the directory where the snapshot is stored. 70 | # do not use /tmp for storage, /tmp here is just 71 | # example sakes. 72 | dataDir=/usr/local/zookeeper/data 73 | dataLogDir=/usr/local/zookeeper/log 74 | # the port at which the clients will connect 75 | clientPort=2181 76 | # the maximum number of client connections. 77 | # increase this if you need to handle more clients 78 | #maxClientCnxns=60 79 | # 80 | # Be sure to read the maintenance section of the 81 | # administrator guide before turning on autopurge. 
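# 提示:autopurge.snapRetainCount(保留的快照个数)与 autopurge.purgeInterval(清理间隔,单位小时,0 表示关闭)
# 配合使用可自动清理旧快照和事务日志,线上集群建议按需开启(见下方被注释的配置项)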
82 | # 83 | # http://zookeeper.apache.org/doc/current/zookeeperAdmin.html#sc_maintenance 84 | # 85 | # The number of snapshots to retain in dataDir 86 | #autopurge.snapRetainCount=3 87 | # Purge task interval in hours 88 | # Set to "0" to disable auto purge feature 89 | #autopurge.purgeInterval=1 90 | ``` 91 | 92 | 配置参数说明: 93 | 94 | - **tickTime**:用于计算的基础时间单元。比如 session 超时:N\*tickTime; 95 | - **initLimit**:用于集群,允许从节点连接并同步到 master 节点的初始化连接时间,以 tickTime 的倍数来表示; 96 | - **syncLimit**:用于集群, master 主节点与从节点之间发送消息,请求和应答时间长度(心跳机制); 97 | - **dataDir**:数据存储位置; 98 | - **dataLogDir**:日志目录; 99 | - **clientPort**:用于客户端连接的端口,默认 2181 100 | 101 | ### 启动服务 102 | 103 | 执行以下命令 104 | 105 | ```bash 106 | bin/zkServer.sh start 107 | ``` 108 | 109 | 执行此命令后,你将收到以下响应 110 | 111 | ```bash 112 | JMX enabled by default 113 | Using config: /Users/../zookeeper-3.4.6/bin/../conf/zoo.cfg 114 | Starting zookeeper ... STARTED 115 | ``` 116 | 117 | ### 停止服务 118 | 119 | 可以使用以下命令停止 zookeeper 服务器。 120 | 121 | ```bash 122 | bin/zkServer.sh stop 123 | ``` 124 | 125 | ## 集群服务部署 126 | 127 | 分布式系统节点数一般都要求是奇数,且最少为 3 个节点,Zookeeper 也不例外。 128 | 129 | 这里,规划一个含 3 个节点的最小 ZooKeeper 集群,主机名分别为 hadoop001,hadoop002,hadoop003 。 130 | 131 | ### 修改配置 132 | 133 | 修改配置文件 `zoo.cfg`,内容如下: 134 | 135 | ```properties 136 | tickTime=2000 137 | initLimit=10 138 | syncLimit=5 139 | dataDir=/usr/local/zookeeper-cluster/data/ 140 | dataLogDir=/usr/local/zookeeper-cluster/log/ 141 | clientPort=2181 142 | 143 | # server.1 这个1是服务器的标识,可以是任意有效数字,标识这是第几个服务器节点,这个标识要写到dataDir目录下面myid文件里 144 | # 指名集群间通讯端口和选举端口 145 | server.1=hadoop001:2287:3387 146 | server.2=hadoop002:2287:3387 147 | server.3=hadoop003:2287:3387 148 | ``` 149 | 150 | ### 标识节点 151 | 152 | 分别在三台主机的 `dataDir` 目录下新建 `myid` 文件,并写入对应的节点标识。Zookeeper 集群通过 `myid` 文件识别集群节点,并通过上文配置的节点通信端口和选举端口来进行节点通信,选举出 Leader 节点。 153 | 154 | 创建存储目录: 155 | 156 | ```bash 157 | # 三台主机均执行该命令 158 | mkdir -vp /usr/local/zookeeper-cluster/data/ 159 | ``` 160 | 161 | 创建并写入节点标识到 `myid` 文件: 162 | 163 | ```bash 164 | # hadoop001主机 165 | echo "1" > /usr/local/zookeeper-cluster/data/myid 166 | # hadoop002主机 167 | echo "2" > /usr/local/zookeeper-cluster/data/myid 168 | # hadoop003主机 169 | echo "3" > /usr/local/zookeeper-cluster/data/myid 170 | ``` 171 | 172 | ### 启动集群 173 | 174 | 分别在三台主机上,执行如下命令启动服务: 175 | 176 | ```bash 177 | /usr/app/zookeeper-cluster/zookeeper/bin/zkServer.sh start 178 | ``` 179 | 180 | ### 集群验证 181 | 182 | 启动后使用 `zkServer.sh status` 查看集群各个节点状态。 183 | 184 | ## 参考资料 185 | 186 | - [Zookeeper 安装](https://www.w3cschool.cn/zookeeper/zookeeper_installation.html) 187 | - [Zookeeper 单机环境和集群环境搭建](https://github.com/heibaiying/BigData-Notes/blob/master/notes/installation/Zookeeper%E5%8D%95%E6%9C%BA%E7%8E%AF%E5%A2%83%E5%92%8C%E9%9B%86%E7%BE%A4%E7%8E%AF%E5%A2%83%E6%90%AD%E5%BB%BA.md) 188 | - [Zookeeper 客户端基础命令使用](https://www.runoob.com/w3cnote/zookeeper-bs-command.html) 189 | -------------------------------------------------------------------------------- /docs/16.大数据/04.zookeeper/05.ZooKeeperAcl.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: ZooKeeperAcl 3 | date: 2022-02-19 13:27:21 4 | categories: 5 | - 大数据 6 | - zookeeper 7 | tags: 8 | - 分布式 9 | - 大数据 10 | - ZooKeeper 11 | - ACL 12 | permalink: /pages/4046ce/ 13 | --- 14 | 15 | # ZooKeeper ACL 16 | 17 | > 为了避免存储在 Zookeeper 上的数据被其他程序或者人为误修改,Zookeeper 提供了 ACL(Access Control Lists) 进行权限控制。 18 | > 19 | > ACL 权限可以针对节点设置相关读写等权限,保障数据安全性。 20 | 21 | ZooKeeper ACL 提供了以下几种命令行: 22 | 23 | - **getAcl 命令**:获取某个节点的 acl 权限信息。 24 | - **setAcl 
命令**:设置某个节点的 acl 权限信息。 25 | - **addauth 命令**:输入认证授权信息,注册时输入明文密码,加密形式保存。 26 | 27 | ## ACL 组成 28 | 29 | Zookeeper 的 acl 通过 **`[scheme:id:permissions]`** 来构成权限列表。 30 | 31 | - **scheme**:代表采用的某种权限机制,包括 world、auth、digest、ip、super 几种。 32 | - **world**:默认模式,所有客户端都拥有指定的权限。world 下只有一个 id 选项,就是 anyone,通常组合写法为 `world:anyone:[permissons]`; 33 | - **auth**:只有经过认证的用户才拥有指定的权限。通常组合写法为 `auth:user:password:[permissons]`,使用这种模式时,你需要先进行登录,之后采用 auth 模式设置权限时,`user` 和 `password` 都将使用登录的用户名和密码; 34 | - **digest**:只有经过认证的用户才拥有指定的权限。通常组合写法为 `auth:user:BASE64(SHA1(password)):[permissons]`,这种形式下的密码必须通过 SHA1 和 BASE64 进行双重加密; 35 | - **ip**:限制只有特定 IP 的客户端才拥有指定的权限。通常组成写法为 `ip:182.168.0.168:[permissions]`; 36 | - **super**:代表超级管理员,拥有所有的权限,需要修改 Zookeeper 启动脚本进行配置。 37 | - **id**:代表允许访问的用户。 38 | - **permissions**:权限组合字符串,由 cdrwa 组成,其中每个字母代表支持不同权限。可选项如下: 39 | - **CREATE**:允许创建子节点; 40 | - **READ**:允许从节点获取数据并列出其子节点; 41 | - **WRITE**:允许为节点设置数据; 42 | - **DELETE**:允许删除子节点; 43 | - **ADMIN**:允许为节点设置权限。 44 | 45 | ## 设置与查看权限 46 | 47 | 想要给某个节点设置权限 (ACL),有以下两个可选的命令: 48 | 49 | ```bash 50 | # 1.给已有节点赋予权限 51 | setAcl path acl 52 | 53 | # 2.在创建节点时候指定权限 54 | create [-s] [-e] path data acl 55 | ``` 56 | 57 | 查看指定节点的权限命令如下: 58 | 59 | ```bash 60 | getAcl path 61 | ``` 62 | 63 | ## 添加认证信息 64 | 65 | 可以使用如下所示的命令为当前 Session 添加用户认证信息,等价于登录操作。 66 | 67 | ```bash 68 | # 格式 69 | addauth scheme auth 70 | 71 | #示例:添加用户名为test,密码为root的用户认证信息 72 | addauth digest test:root 73 | ``` 74 | 75 | ## 权限设置示例 76 | 77 | ### world 模式 78 | 79 | world 是一种默认的模式,即创建时如果不指定权限,则默认的权限就是 world。 80 | 81 | ```bash 82 | [zk: localhost:2181(CONNECTED) 32] create /mytest abc 83 | Created /mytest 84 | [zk: localhost:2181(CONNECTED) 4] getAcl /mytest 85 | 'world,'anyone # 默认的权限 86 | : cdrwa 87 | [zk: localhost:2181(CONNECTED) 34] setAcl /mytest world:anyone:cwda # 修改节点,不允许所有客户端读 88 | .... 
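# 说明:上一步将权限设置为 cwda,相当于去掉了 r(READ)权限,因此之后任何客户端执行 get 都会收到 NoAuth 异常,如下所示: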
89 | [zk: localhost:2181(CONNECTED) 6] get /mytest 90 | org.apache.zookeeper.KeeperException$NoAuthException: KeeperErrorCode = NoAuth for /mytest # 无权访问 91 | ``` 92 | 93 | ### auth 模式 94 | 95 | ```bash 96 | [zk: localhost:2181(CONNECTED) 36] addauth digest test:root # 登录 97 | [zk: localhost:2181(CONNECTED) 37] setAcl /mytest auth::cdrwa # 设置权限 98 | [zk: localhost:2181(CONNECTED) 38] getAcl /mytest # 查看权限信息 99 | 'digest,'heibai:sCxtVJ1gPG8UW/jzFHR0A1ZKY5s= # 用户名和密码 (密码经过加密处理),注意返回的权限类型是 digest 100 | : cdrwa 101 | 102 | # 用户名和密码都是使用登录的用户名和密码,即使你在创建权限时候进行指定也是无效的 103 | [zk: localhost:2181(CONNECTED) 39] setAcl /mytest auth:root:root:cdrwa #指定用户名和密码为 root 104 | [zk: localhost:2181(CONNECTED) 40] getAcl /mytest 105 | 'digest,'heibai:sCxtVJ1gPG8UW/jzFHR0A1ZKY5s= #无效,使用的用户名和密码依然还是 test 106 | : cdrwa 107 | ``` 108 | 109 | ### digest 模式 110 | 111 | ```bash 112 | [zk:44] create /spark "spark" digest:heibai:sCxtVJ1gPG8UW/jzFHR0A1ZKY5s=:cdrwa #指定用户名和加密后的密码 113 | [zk:45] getAcl /spark #获取权限 114 | 'digest,'heibai:sCxtVJ1gPG8UW/jzFHR0A1ZKY5s= # 返回的权限类型是 digest 115 | : cdrwa 116 | ``` 117 | 118 | 到这里你可以发现使用 `auth` 模式设置的权限和使用 `digest` 模式设置的权限,在最终结果上,得到的权限模式都是 `digest`。某种程度上,你可以把 `auth` 模式理解成是 `digest` 模式的一种简便实现。因为在 `digest` 模式下,每次设置都需要书写用户名和加密后的密码,这是比较繁琐的,采用 `auth` 模式就可以避免这种麻烦。 119 | 120 | ### ip 模式 121 | 122 | 限定只有特定的 ip 才能访问。 123 | 124 | ```bash 125 | [zk: localhost:2181(CONNECTED) 46] create /hive "hive" ip:192.168.0.108:cdrwa 126 | [zk: localhost:2181(CONNECTED) 47] get /hive 127 | Authentication is not valid : /hive # 当前主机已经不能访问 128 | ``` 129 | 130 | 这里可以看到当前主机已经不能访问,想要能够再次访问,可以使用对应 IP 的客户端,或使用下面介绍的 `super` 模式。 131 | 132 | ### super 模式 133 | 134 | 需要修改启动脚本 `zkServer.sh`,并在指定位置添加超级管理员账户和密码信息: 135 | 136 | ```bash 137 | "-Dzookeeper.DigestAuthenticationProvider.superDigest=heibai:sCxtVJ1gPG8UW/jzFHR0A1ZKY5s=" 138 | ``` 139 | 140 | 修改完成后需要使用 `zkServer.sh restart` 重启服务,此时再次访问限制 IP 的节点: 141 | 142 | ```bash 143 | [zk: localhost:2181(CONNECTED) 0] get /hive #访问受限 144 | Authentication is not valid : /hive 145 | [zk: localhost:2181(CONNECTED) 1] addauth digest heibai:heibai # 登录 (添加认证信息) 146 | [zk: localhost:2181(CONNECTED) 2] get /hive #成功访问 147 | hive 148 | cZxid = 0x158 149 | ctime = Sat May 25 09:11:29 CST 2019 150 | mZxid = 0x158 151 | mtime = Sat May 25 09:11:29 CST 2019 152 | pZxid = 0x158 153 | cversion = 0 154 | dataVersion = 0 155 | aclVersion = 0 156 | ephemeralOwner = 0x0 157 | dataLength = 4 158 | numChildren = 0 159 | ``` 160 | 161 | ## 参考资料 162 | 163 | - [Zookeeper 安装](https://www.w3cschool.cn/zookeeper/zookeeper_installation.html) 164 | - [Zookeeper 单机环境和集群环境搭建](https://github.com/heibaiying/BigData-Notes/blob/master/notes/installation/Zookeeper%E5%8D%95%E6%9C%BA%E7%8E%AF%E5%A2%83%E5%92%8C%E9%9B%86%E7%BE%A4%E7%8E%AF%E5%A2%83%E6%90%AD%E5%BB%BA.md) 165 | - [Zookeeper 客户端基础命令使用](https://www.runoob.com/w3cnote/zookeeper-bs-command.html) 166 | -------------------------------------------------------------------------------- /docs/16.大数据/04.zookeeper/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: ZooKeeper 教程 3 | date: 2020-09-09 17:53:08 4 | categories: 5 | - 大数据 6 | - zookeeper 7 | tags: 8 | - 分布式 9 | - 大数据 10 | - ZooKeeper 11 | permalink: /pages/1b41b6/ 12 | hidden: true 13 | --- 14 | 15 | # ZooKeeper 教程 16 | 17 | > ZooKeeper 是 Apache 的顶级项目。**ZooKeeper 为分布式应用提供了高效且可靠的分布式协调服务,提供了诸如统一命名服务、配置管理和分布式锁等分布式的基础服务。在解决分布式数据一致性方面,ZooKeeper 并没有直接采用 Paxos 算法,而是采用了名为 ZAB 的一致性协议**。 18 | > 19 | > ZooKeeper 
主要用来解决分布式集群中应用系统的一致性问题,它能提供基于类似于文件系统的目录节点树方式的数据存储。但是 ZooKeeper 并不是用来专门存储数据的,它的作用主要是用来**维护和监控存储数据的状态变化。通过监控这些数据状态的变化,从而可以达到基于数据的集群管理**。 20 | > 21 | > 很多大名鼎鼎的框架都基于 ZooKeeper 来实现分布式高可用,如:Dubbo、Kafka 等。 22 | > 23 | > ZooKeeper 官方支持 Java 和 C 的 Client API。ZooKeeper 社区为大多数语言(.NET,python 等)提供非官方 API。 24 | 25 | ## 📖 内容 26 | 27 | ### [ZooKeeper 原理](01.ZooKeeper原理.md) 28 | 29 | ### [ZooKeeper 命令](02.ZooKeeper命令.md) 30 | 31 | ### [ZooKeeper 运维](03.ZooKeeper运维.md) 32 | 33 | ### [ZooKeeper Java API](04.ZooKeeperJavaApi.md) 34 | 35 | ### [ZooKeeper ACL](05.ZooKeeperAcl.md) 36 | 37 | ## 📚 资料 38 | 39 | - **官方** 40 | - [ZooKeeper 官网](http://zookeeper.apache.org/) 41 | - [ZooKeeper 官方文档](https://cwiki.apache.org/confluence/display/ZOOKEEPER) 42 | - [ZooKeeper Github](https://github.com/apache/zookeeper) 43 | - [Apache Curator 官网](http://curator.apache.org/) 44 | - **书籍** 45 | - [《Hadoop 权威指南(第四版)》](https://item.jd.com/12109713.html) 46 | - [《从 Paxos 到 Zookeeper 分布式一致性原理与实践》](https://item.jd.com/11622772.html) 47 | - **文章** 48 | - [分布式服务框架 ZooKeeper -- 管理分布式环境中的数据](https://www.ibm.com/developerworks/cn/opensource/os-cn-zookeeper/index.html) 49 | - [ZooKeeper 的功能以及工作原理](https://www.cnblogs.com/felixzh/p/5869212.html) 50 | - [ZooKeeper 简介及核心概念](https://github.com/heibaiying/BigData-Notes/blob/master/notes/ZooKeeper%E7%AE%80%E4%BB%8B%E5%8F%8A%E6%A0%B8%E5%BF%83%E6%A6%82%E5%BF%B5.md) 51 | - [详解分布式协调服务 ZooKeeper](https://draveness.me/zookeeper-chubby) 52 | - [深入浅出 Zookeeper(一) Zookeeper 架构及 FastLeaderElection 机制](http://www.jasongj.com/zookeeper/fastleaderelection/) 53 | - [Introduction to Apache ZooKeeper](https://www.slideshare.net/sauravhaloi/introduction-to-apache-zookeeper) 54 | - [Zookeeper 的优缺点](https://blog.csdn.net/wwwsq/article/details/7644445) 55 | 56 | ## 🚪 传送 57 | 58 | ◾ 💧 [钝悟的 IT 知识图谱](https://dunwu.github.io/waterdrop/) ◾ 🎯 [钝悟的博客](https://dunwu.github.io/blog/) ◾ 59 | -------------------------------------------------------------------------------- /docs/16.大数据/11.spark/01.Spark简介.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Spark 简介 3 | date: 2019-05-07 20:19:25 4 | categories: 5 | - 大数据 6 | - spark 7 | tags: 8 | - 大数据 9 | - Spark 10 | permalink: /pages/80d4a7/ 11 | --- 12 | 13 | # Spark 14 | 15 | ## Spark 简介 16 | 17 | ### Spark 概念 18 | 19 | - 大规模分布式通用计算引擎 20 | - Spark Core:核心计算框架 21 | - Spark SQL:结构化数据查询 22 | - Spark Streaming:实时流处理 23 | - Spark MLib:机器学习 24 | - Spark GraphX:图计算 25 | - 具有高吞吐、低延时、通用易扩展、高容错等特点 26 | - 采用 Scala 语言开发 27 | - 提供多种运行模式 28 | 29 | ### Spark 特点 30 | 31 | - 计算高效 32 | - 利用内存计算、Cache 缓存机制,支持迭代计算和数据共享,减少数据读取的 IO 开销 33 | - 利用 DAG 引擎,减少中间计算结果写入 HDFS 的开销 34 | - 利用多线程池模型,减少任务启动开销,避免 Shuffle 中不必要的排序和磁盘 IO 操作 35 | - 通用易用 36 | - 适用于批处理、流处理、交互式计算、机器学习算法等场景 37 | - 提供了丰富的开发 API,支持 Scala、Java、Python、R 等 38 | - 运行模式多样 39 | - Local 模式 40 | - Standalone 模式 41 | - YARN/Mesos 模式 42 | - 计算高效 43 | - 利用内存计算、Cache 缓存机制,支持迭代计算和数据共享,减少数据读取的 IO 开销 44 | - 利用 DAG 引擎,减少中间计算结果写入 HDFS 的开销 45 | - 利用多线程池模型,减少任务启动开销,避免 Shuffle 中不必要的排序和磁盘 IO 操作 46 | - 通用易用 47 | - 适用于批处理、流处理、交互式计算、机器学习等场景 48 | - 提供了丰富的开发 API,支持 Scala、Java、Python、R 等 49 | 50 | ## Spark 原理 51 | 52 | ### 编程模型 53 | 54 | #### RDD 55 | 56 | - 弹性分布式数据集(Resilient Distributed Datesets) 57 | - 分布在集群中的只读对象集合 58 | - 由多个 Partition 组成 59 | - 通过转换操作构造 60 | - 失效后自动重构(弹性) 61 | - 存储在内存或磁盘中 62 | - Spark 基于 RDD 进行计算 63 | 64 | #### RDD 操作(Operator) 65 | 66 | - Transformation(转换) 67 | - 将 Scala 集合或 Hadoop 输入数据构造成一个新 RDD 68 | - 通过已有的 RDD 产生新 RDD 69 | - 惰性执行:只记录转换关系,不触发计算 70 | - 
例如:map、filter、flatmap、union、distinct、sortbykey 71 | - Action(动作) 72 | - 通过 RDD 计算得到一个值或一组值 73 | - 真正触发计算 74 | - 例如:first、count、collect、foreach、saveAsTextFile 75 | 76 | #### RDD 依赖(Dependency) 77 | 78 | - 窄依赖(Narrow Dependency) 79 | - 父 RDD 中的分区最多只能被一个子 RDD 的一个分区使用 80 | - 子 RDD 如果有部分分区数据丢失或损坏,只需从对应的父 RDD 重新计算恢复 81 | - 例如:map、filter、union 82 | - 宽依赖(Shuffle/Wide Dependency ) 83 | - 子 RDD 分区依赖父 RDD 的所有分区 84 | - 子 RDD 如果部分或全部分区数据丢失或损坏,必须从所有父 RDD 分区重新计算 85 | - 相对于窄依赖,宽依赖付出的代价要高很多,尽量避免使用 86 | - 例如:groupByKey、reduceByKey、sortByKey 87 | -------------------------------------------------------------------------------- /docs/16.大数据/13.flink/07.Flink运维.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Flink 运维 3 | date: 2022-02-21 09:44:33 4 | categories: 5 | - 大数据 6 | - flink 7 | tags: 8 | - 大数据 9 | - Flink 10 | - 运维 11 | permalink: /pages/38ec73/ 12 | --- 13 | 14 | # Flink 运维 15 | 16 | ## docker 安装 flink 17 | 18 | (1)使用 docker 命令拉取镜像 19 | 20 | ```bash 21 | docker pull flink 22 | ``` 23 | 24 | (2)编写 `docker-compose.yml`,内容如下: 25 | 26 | ```yml 27 | version: '2.1' 28 | services: 29 | jobmanager: 30 | image: flink 31 | expose: 32 | - '6123' 33 | ports: 34 | - '8081:8081' 35 | command: jobmanager 36 | environment: 37 | - JOB_MANAGER_RPC_ADDRESS=jobmanager 38 | 39 | taskmanager: 40 | image: flink 41 | expose: 42 | - '6121' 43 | - '6122' 44 | depends_on: 45 | - jobmanager 46 | command: taskmanager 47 | links: 48 | - 'jobmanager:jobmanager' 49 | environment: 50 | - JOB_MANAGER_RPC_ADDRESS=jobmanager 51 | ``` 52 | 53 | (3)执行 docker-compose,命令如下: 54 | 55 | ``` 56 | docker-compose up -d 57 | ``` 58 | 59 | (4)打开浏览器,访问 http://127.0.0.1:8081 60 | 61 | ## Flink 配置 62 | 63 | ### 基础配置 64 | 65 | ```yml 66 | # jobManager 的IP地址 67 | jobmanager.rpc.address: localhost 68 | 69 | # JobManager 的端口号 70 | jobmanager.rpc.port: 6123 71 | 72 | # JobManager JVM heap 内存大小 73 | jobmanager.heap.size: 1024m 74 | 75 | # TaskManager JVM heap 内存大小 76 | taskmanager.heap.size: 1024m 77 | 78 | # 每个 TaskManager 提供的任务 slots 数量大小 79 | taskmanager.numberOfTaskSlots: 1 80 | 81 | # 程序默认并行计算的个数 82 | parallelism.default: 1 83 | # 文件系统来源 84 | # fs.default-scheme 85 | ``` 86 | 87 | ### 高可用配置 88 | 89 | ```yml 90 | # 可以选择 'NONE' 或者 'zookeeper'. 91 | # high-availability: zookeeper 92 | 93 | # 文件系统路径,让 Flink 在高可用性设置中持久保存元数据 94 | # high-availability.storageDir: hdfs:///flink/ha/ 95 | 96 | # zookeeper 集群中仲裁者的机器 ip 和 port 端口号 97 | # high-availability.zookeeper.quorum: localhost:2181 98 | 99 | # 默认是 open,如果 zookeeper security 启用了该值会更改成 creator 100 | # high-availability.zookeeper.client.acl: open 101 | ``` 102 | 103 | ### 容错和 checkpoint 配置 104 | 105 | ```yml 106 | # 用于存储和检查点状态 107 | # state.backend: filesystem 108 | 109 | # 存储检查点的数据文件和元数据的默认目录 110 | # state.checkpoints.dir: hdfs://namenode-host:port/flink-checkpoints 111 | 112 | # savepoints 的默认目标目录(可选) 113 | # state.savepoints.dir: hdfs://namenode-host:port/flink-checkpoints 114 | 115 | # 用于启用/禁用增量 checkpoints 的标志 116 | # state.backend.incremental: false 117 | ``` 118 | 119 | ### Web UI 配置 120 | 121 | ```yml 122 | # 基于 Web 的运行时监视器侦听的地址. 
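# 补充说明(编者注):jobmanager.web.address 属于较早版本的配置项,较新的 Flink 版本中一般改用 rest.address / rest.bind-address;此处保留原示例,实际键名请以所用版本的官方文档为准。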
123 | #jobmanager.web.address: 0.0.0.0 124 | 125 | # Web 的运行时监视器端口 126 | rest.port: 8081 127 | # 是否从基于 Web 的 jobmanager 启用作业提交 128 | # jobmanager.web.submit.enable: false 129 | ``` 130 | 131 | ### 高级配置 132 | 133 | ```yml 134 | # io.tmp.dirs: /tmp 135 | 136 | # 是否应在 TaskManager 启动时预先分配 TaskManager 管理的内存 137 | # taskmanager.memory.preallocate: false 138 | 139 | # 类加载解析顺序,是先检查用户代码 jar(“child-first”)还是应用程序类路径(“parent-first”)。 默认设置指示首先从用户代码 jar 加载类 140 | # classloader.resolve-order: child-first 141 | 142 | # 用于网络缓冲区的 JVM 内存的分数。 这决定了 TaskManager 可以同时拥有多少流数据交换通道以及通道缓冲的程度。 如果作业被拒绝或者您收到系统没有足够缓冲区的警告,请增加此值或下面的最小/最大值。 另请注意,“taskmanager.network.memory.min”和“taskmanager.network.memory.max”可能会覆盖此分数 143 | 144 | # taskmanager.network.memory.fraction: 0.1 145 | # taskmanager.network.memory.min: 67108864 146 | # taskmanager.network.memory.max: 1073741824 147 | ``` 148 | 149 | ### Flink 集群安全配置 150 | 151 | ``` 152 | # 指示是否从 Kerberos ticket 缓存中读取 153 | # security.kerberos.login.use-ticket-cache: true 154 | 155 | # 包含用户凭据的 Kerberos 密钥表文件的绝对路径 156 | # security.kerberos.login.keytab: /path/to/kerberos/keytab 157 | 158 | # 与 keytab 关联的 Kerberos 主体名称 159 | # security.kerberos.login.principal: flink-user 160 | 161 | # 以逗号分隔的登录上下文列表,用于提供 Kerberos 凭据(例如,`Client,KafkaClient`使用凭证进行 ZooKeeper 身份验证和 Kafka 身份验证) 162 | # security.kerberos.login.contexts: Client,KafkaClient 163 | ``` 164 | 165 | ### Zookeeper 安全配置 166 | 167 | ```yml 168 | # 覆盖以下配置以提供自定义 ZK 服务名称 169 | # zookeeper.sasl.service-name: zookeeper 170 | 171 | # 该配置必须匹配 "security.kerberos.login.contexts" 中的列表(含有一个) 172 | # zookeeper.sasl.login-context-name: Client 173 | ``` 174 | 175 | ## 参考资料 176 | 177 | - [Flink 官方配置文档](https://ci.apache.org/projects/flink/flink-docs-stable/ops/config.html) 178 | - [Apache Flink Docker Github](https://github.com/apache/flink-docker) 179 | - [借助 Docker 学习大数据:Flink](https://zhuanlan.zhihu.com/p/176855301) 180 | -------------------------------------------------------------------------------- /docs/16.大数据/13.flink/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Flink 教程 3 | date: 2022-02-21 09:30:27 4 | categories: 5 | - 大数据 6 | - flink 7 | tags: 8 | - 大数据 9 | - Flink 10 | permalink: /pages/5c85bd/ 11 | hidden: true 12 | --- 13 | 14 | # Flink 教程 15 | 16 | > Apache Flink 是一个框架和分布式处理引擎,用于在**无边界**和**有边界**数据流上进行**有状态**的计算。Flink 能在所有常见集群环境中运行,并能以内存速度和任意规模进行计算。 17 | 18 | ## 📖 内容 19 | 20 | ### [Flink 入门](01.Flink入门.md) 21 | 22 | ### [Flink 简介](02.Flink简介.md) 23 | 24 | ### [Flink ETL](03.FlinkETL.md) 25 | 26 | ### [Flink 事件驱动](04.Flink事件驱动.md) 27 | 28 | ### [Flink API](05.FlinkApi.md) 29 | 30 | ### [Flink 架构](06.Flink架构.md) 31 | 32 | ### [Flink 运维](07.Flink运维.md) 33 | 34 | ### [Flink Table API & SQL](08.FlinkTableApi.md) 35 | 36 | ## 📚 资料 37 | 38 | - **官方** 39 | - [Flink Github](https://github.com/apache/flink) 40 | - [Flink 官方文档](https://nightlies.apache.org/flink/flink-docs-release-1.14/zh/) 41 | - **教程** 42 | - [flink-learning](https://github.com/zhisheng17/flink-learning) 43 | - [flink-training-course](https://github.com/flink-china/flink-training-course) - Flink 中文视频课程 44 | 45 | ## 🚪 传送 46 | 47 | ◾ 💧 [钝悟的 IT 知识图谱](https://dunwu.github.io/waterdrop/) ◾ 🎯 [钝悟的博客](https://dunwu.github.io/blog/) ◾ 48 | -------------------------------------------------------------------------------- /docs/16.大数据/99.其他/01.flume.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Flume 3 | date: 2019-05-07 20:19:25 4 | categories: 5 | - 大数据 6 | - 其他 
7 | tags: 8 | - 大数据 9 | - Flume 10 | permalink: /pages/ac5a41/ 11 | --- 12 | 13 | # Flume 14 | 15 | > **Sqoop 是一个主要在 Hadoop 和关系数据库之间进行批量数据迁移的工具。** 16 | 17 | ## Flume 简介 18 | 19 | ### 什么是 Flume ? 20 | 21 | Flume 是一个分布式海量数据采集、聚合和传输系统。 22 | 23 | 特点 24 | 25 | - 基于事件的海量数据采集 26 | - 数据流模型:Source -> Channel -> Sink 27 | - 事务机制:支持重读重写,保证消息传递的可靠性 28 | - 内置丰富插件:轻松与各种外部系统集成 29 | - 高可用:Agent 主备切换 30 | - Java 实现:开源,优秀的系统设计 31 | 32 | ### 应用场景 33 | 34 | ## Flume 原理 35 | 36 | ### Flume 基本概念 37 | 38 | - Event:事件,最小数据传输单元,由 Header 和 Body 组成。 39 | - Agent:代理,JVM 进程,最小运行单元,由 Source、Channel、Sink 三个基本组件构成,负责将外部数据源产生的数据以 Event 的形式传输到目的地 40 | - Source:负责对接各种外部数据源,将采集到的数据封装成 Event,然后写入 Channel 41 | - Channel:Event 暂存容器,负责保存 Source 发送的 Event,直至被 Sink 成功读取 42 | - Sink:负责从 Channel 读取 Event,然后将其写入外部存储,或传输给下一阶段的 Agent 43 | - 映射关系:1 个 Source -> 多个 Channel,1 个 Channel -> 多个 Sink,1 个 Sink -> 1 个 Channel 44 | 45 | ### Flume 基本组件 46 | 47 | #### Source 组件 48 | 49 | - 对接各种外部数据源,将采集到的数据封装成 Event,然后写入 Channel 50 | - 一个 Source 可向多个 Channel 发送 Event 51 | - Flume 内置类型丰富的 Source,同时用户可自定义 Source 52 | 53 | #### Channel 组件 54 | 55 | - Event 中转暂存区,存储 Source 采集但未被 Sink 读取的 Event 56 | - 为了平衡 Source 采集、Sink 读取的速度,可视为 Flume 内部的消息队列 57 | - 线程安全并具有事务性,支持 Source 写失败重写和 Sink 读失败重读 58 | 59 | #### Sink 组件 60 | 61 | - 从 Channel 读取 Event,将其写入外部存储,或传输到下一阶段的 Agent 62 | - 一个 Sink 只能从一个 Channel 中读取 Event 63 | - Sink 成功读取 Event 后,向 Channel 提交事务,Event 被删除,否则 Channel 会等待 Sink 重新读取 64 | 65 | ### Flume 数据流 66 | 67 | 单层架构 68 | 69 | 优点:架构简单,使用方便,占用资源较少 70 | 缺点 71 | 如果采集的数据源或 Agent 较多,将 Event 写入到 HDFS 会产生很多小文件 72 | 外部存储升级维护或发生故障,需对采集层的所有 Agent 做处理,人力成本较高,系统稳定性较差 73 | 系统安全性较差 74 | 数据源管理较混乱 75 | -------------------------------------------------------------------------------- /docs/16.大数据/99.其他/02.sqoop.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: sqoop 3 | date: 2020-09-09 17:53:08 4 | categories: 5 | - 大数据 6 | - 其他 7 | tags: 8 | - 大数据 9 | - Sqoop 10 | permalink: /pages/773408/ 11 | --- 12 | 13 | # Sqoop 14 | 15 | > **Sqoop 是一个主要在 Hadoop 和关系数据库之间进行批量数据迁移的工具。** 16 | 17 | ## Sqoop 简介 18 | 19 | **Sqoop 是一个主要在 Hadoop 和关系数据库之间进行批量数据迁移的工具。** 20 | 21 | - Hadoop:HDFS、Hive、HBase、Inceptor、Hyperbase 22 | - 面向大数据集的批量导入导出 23 | - 将输入数据集分为 N 个切片,然后启动 N 个 Map 任务并行传输 24 | - 支持全量、增量两种传输方式 25 | 26 | ### 提供多种 Sqoop 连接器 27 | 28 | #### 内置连接器 29 | 30 | - 经过优化的专用 RDBMS 连接器:MySQL、PostgreSQL、Oracle、DB2、SQL Server、Netzza 等 31 | - 通用的 JDBC 连接器:支持 JDBC 协议的数据库 32 | 33 | #### 第三方连接器 34 | 35 | - 数据仓库:Teradata 36 | - NoSQL 数据库:Couchbase 37 | 38 | ### Sqoop 版本 39 | 40 | #### Sqoop 1 优缺点 41 | 42 | ![img](https://raw.githubusercontent.com/dunwu/images/master/cs/bigdata/Sqoop/sqoop-architecture.png) 43 | 44 | 优点 45 | 46 | - 架构简单 47 | - 部署简单 48 | - 功能全面 49 | - 稳定性较高 50 | - 速度较快 51 | 52 | 缺点 53 | 54 | - 访问方式单一 55 | - 命令行方式容易出错,格式紧耦合 56 | - 安全机制不够完善,存在密码泄露风险 57 | 58 | #### Sqoop 2 优缺点 59 | 60 | ![img](https://raw.githubusercontent.com/dunwu/images/master/cs/bigdata/Sqoop/sqoop-v2-architecture.png) 61 | 62 | 优点 63 | 64 | - 访问方式多样 65 | - 集中管理连接器 66 | - 安全机制较完善 67 | - 支持多用户 68 | 69 | 缺点 70 | 71 | - 架构较复杂 72 | - 部署较繁琐 73 | - 稳定性一般 74 | - 速度一般 75 | 76 | ## Sqoop 原理 77 | 78 | ### 导入 79 | 80 | ![img](https://raw.githubusercontent.com/dunwu/images/master/cs/bigdata/Sqoop/sqoop-import.png) 81 | 82 | ### 导出 83 | 84 | ![img](https://raw.githubusercontent.com/dunwu/images/master/cs/bigdata/Sqoop/sqoop-export.png) 85 | -------------------------------------------------------------------------------- /docs/@pages/archivesPage.md: 
-------------------------------------------------------------------------------- 1 | --- 2 | archivesPage: true 3 | title: 归档 4 | permalink: /archives/ 5 | article: false 6 | --- 7 | -------------------------------------------------------------------------------- /docs/@pages/categoriesPage.md: -------------------------------------------------------------------------------- 1 | --- 2 | categoriesPage: true 3 | title: 分类 4 | permalink: /categories/ 5 | article: false 6 | --- 7 | -------------------------------------------------------------------------------- /docs/@pages/tagsPage.md: -------------------------------------------------------------------------------- 1 | --- 2 | tagsPage: true 3 | title: 标签 4 | permalink: /tags/ 5 | article: false 6 | --- 7 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "bigdata-tutorial", 3 | "version": "1.0.0", 4 | "private": true, 5 | "scripts": { 6 | "clean": "rimraf docs/.temp", 7 | "start": "vuepress dev docs", 8 | "build": "vuepress build docs", 9 | "deploy": "bash scripts/deploy.sh", 10 | "updateTheme": "yarn remove vuepress-theme-vdoing && rm -rf node_modules && yarn && yarn add vuepress-theme-vdoing -D", 11 | "editFm": "node utils/editFrontmatter.js", 12 | "lint": "markdownlint -r markdownlint-rule-emphasis-style -c docs/.markdownlint.json **/*.md -i node_modules", 13 | "lint:fix": "markdownlint -f -r markdownlint-rule-emphasis-style -c docs/.markdownlint.json **/*.md -i node_modules", 14 | "show-help": "vuepress --help", 15 | "view-info": "vuepress view-info ./ --temp docs/.temp" 16 | }, 17 | "devDependencies": { 18 | "dayjs": "^1.11.7", 19 | "inquirer": "^7.1.0", 20 | "json2yaml": "^1.1.0", 21 | "markdownlint-cli": "^0.25.0", 22 | "markdownlint-rule-emphasis-style": "^1.0.1", 23 | "rimraf": "^3.0.1", 24 | "vue-toasted": "^1.1.25", 25 | "vuepress": "^1.9.8", 26 | "vuepress-plugin-baidu-tongji": "^1.0.1", 27 | "vuepress-plugin-comment": "^0.7.3", 28 | "vuepress-plugin-demo-block": "^0.7.2", 29 | "vuepress-plugin-flowchart": "^1.4.2", 30 | "vuepress-plugin-fulltext-search": "^2.2.1", 31 | "vuepress-plugin-one-click-copy": "^1.0.2", 32 | "vuepress-plugin-thirdparty-search": "^1.0.2", 33 | "vuepress-plugin-zooming": "^1.1.7", 34 | "vuepress-theme-vdoing": "^1.12.8", 35 | "yamljs": "^0.3.0" 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | io.github.dunwu.bigdata 7 | bigdata-tutorial 8 | 大数据 9 | 1.0.0 10 | pom 11 | 12 | 13 | codes/hbase 14 | codes/kafka 15 | codes/zookeeper 16 | codes/flink 17 | 18 | 19 | -------------------------------------------------------------------------------- /prettier.config.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @see https://prettier.io/docs/en/options.html 3 | * @see https://prettier.io/docs/en/configuration.html 4 | */ 5 | module.exports = { 6 | tabWidth: 2, 7 | semi: false, 8 | singleQuote: true, 9 | trailingComma: 'none' 10 | } 11 | -------------------------------------------------------------------------------- /scripts/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | # ------------------------------------------------------------------------------ 4 | # gh-pages 部署脚本 5 | # @author 
Zhang Peng 6 | # @since 2020/2/10 7 | # ------------------------------------------------------------------------------ 8 | 9 | # 装载其它库 10 | ROOT_DIR=$( 11 | cd $(dirname $0)/.. 12 | pwd 13 | ) 14 | 15 | # 确保脚本抛出遇到的错误 16 | set -e 17 | 18 | # 生成静态文件 19 | npm run build 20 | 21 | # 进入生成的文件夹 22 | cd ${ROOT_DIR}/docs/.temp 23 | 24 | # 如果是发布到自定义域名 25 | # echo 'www.example.com' > CNAME 26 | 27 | if [[ ${GITHUB_TOKEN} && ${GITEE_TOKEN} ]]; then 28 | msg='自动部署' 29 | GITHUB_URL=https://dunwu:${GITHUB_TOKEN}@github.com/dunwu/bigdata-tutorial.git 30 | GITEE_URL=https://turnon:${GITEE_TOKEN}@gitee.com/turnon/bigdata-tutorial.git 31 | git config --global user.name "dunwu" 32 | git config --global user.email "forbreak@163.com" 33 | else 34 | msg='手动部署' 35 | GITHUB_URL=git@github.com:dunwu/bigdata-tutorial.git 36 | GITEE_URL=git@gitee.com:turnon/bigdata-tutorial.git 37 | fi 38 | git init 39 | git add -A 40 | git commit -m "${msg}" 41 | # 推送到github gh-pages分支 42 | git push -f "${GITHUB_URL}" master:gh-pages 43 | git push -f "${GITEE_URL}" master:gh-pages 44 | 45 | cd - 46 | rm -rf ${ROOT_DIR}/docs/.temp 47 | -------------------------------------------------------------------------------- /utils/config.yml: -------------------------------------------------------------------------------- 1 | #批量添加和修改、删除front matter配置文件 2 | 3 | # 需要批量处理的路径,docs文件夹内的文件夹 (数组。映射路径:docs/arr[0]/arr[1] ... ) 4 | path: 5 | - docs # 第一个成员必须是docs 6 | 7 | # 要删除的字段 (数组) 8 | delete: 9 | - categories 10 | # - tags 11 | 12 | # 要添加、修改front matter的数据 (front matter中没有的数据则添加,已有的数据则覆盖) 13 | #data: 14 | # article: false 15 | -------------------------------------------------------------------------------- /utils/editFrontmatter.js: -------------------------------------------------------------------------------- 1 | /** 2 | * 批量添加和修改front matter ,需要配置 ./config.yml 文件。 3 | */ 4 | const fs = require('fs'); // 文件模块 5 | const path = require('path'); // 路径模块 6 | const matter = require('gray-matter'); // front matter解析器 https://github.com/jonschlinkert/gray-matter 7 | const jsonToYaml = require('json2yaml') 8 | const yamlToJs = require('yamljs') 9 | const inquirer = require('inquirer') // 命令行操作 10 | const chalk = require('chalk') // 命令行打印美化 11 | const readFileList = require('./modules/readFileList'); 12 | const { type, repairDate} = require('./modules/fn'); 13 | const log = console.log 14 | 15 | const configPath = path.join(__dirname, 'config.yml') // 配置文件的路径 16 | 17 | main(); 18 | 19 | /** 20 | * 主体函数 21 | */ 22 | async function main() { 23 | 24 | const promptList = [{ 25 | type: "confirm", 26 | message: chalk.yellow('批量操作frontmatter有修改数据的风险,确定要继续吗?'), 27 | name: "edit", 28 | }]; 29 | let edit = true; 30 | 31 | await inquirer.prompt(promptList).then(answers => { 32 | edit = answers.edit 33 | }) 34 | 35 | if(!edit) { // 退出操作 36 | return 37 | } 38 | 39 | const config = yamlToJs.load(configPath) // 解析配置文件的数据转为js对象 40 | 41 | if (type(config.path) !== 'array') { 42 | log(chalk.red('路径配置有误,path字段应该是一个数组')) 43 | return 44 | } 45 | 46 | if (config.path[0] !== 'docs') { 47 | log(chalk.red("路径配置有误,path数组的第一个成员必须是'docs'")) 48 | return 49 | } 50 | 51 | const filePath = path.join(__dirname, '..', ...config.path); // 要批量修改的文件路径 52 | const files = readFileList(filePath); // 读取所有md文件数据 53 | 54 | files.forEach(file => { 55 | let dataStr = fs.readFileSync(file.filePath, 'utf8');// 读取每个md文件的内容 56 | const fileMatterObj = matter(dataStr) // 解析md文件的front Matter。 fileMatterObj => {content:'剔除frontmatter后的文件内容字符串', data:{}, ...} 57 | let matterData = fileMatterObj.data; // 
得到md文件的front Matter 58 | 59 | let mark = false 60 | // 删除操作 61 | if (config.delete) { 62 | if( type(config.delete) !== 'array' ) { 63 | log(chalk.yellow('未能完成删除操作,delete字段的值应该是一个数组!')) 64 | } else { 65 | config.delete.forEach(item => { 66 | if (matterData[item]) { 67 | delete matterData[item] 68 | mark = true 69 | } 70 | }) 71 | 72 | } 73 | } 74 | 75 | // 添加、修改操作 76 | if (type(config.data) === 'object') { 77 | Object.assign(matterData, config.data) // 将配置数据合并到front Matter对象 78 | mark = true 79 | } 80 | 81 | // 有操作时才继续 82 | if (mark) { 83 | if(matterData.date && type(matterData.date) === 'date') { 84 | matterData.date = repairDate(matterData.date) // 修复时间格式 85 | } 86 | const newData = jsonToYaml.stringify(matterData).replace(/\n\s{2}/g,"\n").replace(/"/g,"") + '---\r\n' + fileMatterObj.content; 87 | fs.writeFileSync(file.filePath, newData); // 写入 88 | log(chalk.green(`update frontmatter:${file.filePath} `)) 89 | } 90 | 91 | }) 92 | } 93 | -------------------------------------------------------------------------------- /utils/modules/fn.js: -------------------------------------------------------------------------------- 1 | // 类型判断 2 | exports.type = function (o){ 3 | var s = Object.prototype.toString.call(o) 4 | return s.match(/\[object (.*?)\]/)[1].toLowerCase() 5 | } 6 | 7 | // 修复date时区格式的问题 8 | exports.repairDate = function (date) { 9 | date = new Date(date); 10 | return `${date.getUTCFullYear()}-${zero(date.getUTCMonth()+1)}-${zero(date.getUTCDate())} ${zero(date.getUTCHours())}:${zero(date.getUTCMinutes())}:${zero(date.getUTCSeconds())}`; 11 | } 12 | 13 | // 日期的格式 14 | exports.dateFormat = function (date) { 15 | return `${date.getFullYear()}-${zero(date.getMonth()+1)}-${zero(date.getDate())} ${zero(date.getHours())}:${zero(date.getMinutes())}:${zero(date.getSeconds())}` 16 | } 17 | 18 | // 小于10补0 19 | function zero(d){ 20 | return d.toString().padStart(2,'0') 21 | } -------------------------------------------------------------------------------- /utils/modules/readFileList.js: -------------------------------------------------------------------------------- 1 | /** 2 | * 读取所有md文件数据 3 | */ 4 | const fs = require('fs'); // 文件模块 5 | const path = require('path'); // 路径模块 6 | const docsRoot = path.join(__dirname, '..', '..', 'docs'); // docs文件路径 7 | 8 | function readFileList(dir = docsRoot, filesList = []) { 9 | const files = fs.readdirSync(dir); 10 | files.forEach( (item, index) => { 11 | let filePath = path.join(dir, item); 12 | const stat = fs.statSync(filePath); 13 | if (stat.isDirectory() && item !== '.vuepress') { 14 | readFileList(path.join(dir, item), filesList); //递归读取文件 15 | } else { 16 | if(path.basename(dir) !== 'docs'){ // 过滤docs目录级下的文件 17 | 18 | const fileNameArr = path.basename(filePath).split('.') 19 | let name = null, type = null; 20 | if (fileNameArr.length === 2) { // 没有序号的文件 21 | name = fileNameArr[0] 22 | type = fileNameArr[1] 23 | } else if (fileNameArr.length === 3) { // 有序号的文件 24 | name = fileNameArr[1] 25 | type = fileNameArr[2] 26 | } else { // 超过两个‘.’的 27 | log(chalk.yellow(`warning: 该文件 "${filePath}" 没有按照约定命名,将忽略生成相应数据。`)) 28 | return 29 | } 30 | if(type === 'md'){ // 过滤非md文件 31 | filesList.push({ 32 | name, 33 | filePath 34 | }); 35 | } 36 | 37 | } 38 | } 39 | }); 40 | return filesList; 41 | } 42 | 43 | module.exports = readFileList; --------------------------------------------------------------------------------