├── README.md ├── log └── test.log ├── pom.xml └── src └── main ├── java └── com │ └── tools │ ├── clickhouse │ └── 环境搭建.md │ ├── hadoop │ ├── config │ │ ├── core-site.xml │ │ ├── hdfs-site.xml │ │ ├── mapred-site.xml │ │ └── yarn-site.xml │ ├── hdfs │ │ ├── HDFSClient.java │ │ ├── HDFSStream.java │ │ └── UploadFileToHdfsByCompress.java │ ├── mr │ │ ├── HFileGenerator.java │ │ ├── HFileImportMapper.java │ │ ├── dataclean │ │ │ └── DataClean.java │ │ ├── duplicate │ │ │ └── DuplicateRemoval.java │ │ ├── inputformat │ │ │ ├── SmallFiles2SequenceFile.java │ │ │ ├── SmallFiles2SequenceMapper.java │ │ │ ├── WholeFileInputFormat.java │ │ │ └── WholeRecordReader.java │ │ ├── outputformat │ │ │ ├── MyFileOutputFormat.java │ │ │ ├── MyFileOutputFormatMain.java │ │ │ └── MyRecordWriter.java │ │ ├── secondarysort │ │ │ ├── MyOrder.java │ │ │ ├── MyOrderGroup.java │ │ │ ├── MyOrderMapper.java │ │ │ ├── MyOrderReducer.java │ │ │ ├── MyPartitioner.java │ │ │ └── OrderBean.java │ │ └── wordcount │ │ │ ├── LogCount.java │ │ │ ├── LogCountMapper.java │ │ │ ├── LogCountReducer.java │ │ │ ├── MyPartitioner.java │ │ │ ├── WordCountByPartitioner.java │ │ │ ├── WordCountMapper.java │ │ │ └── WordCountReducer.java │ └── 集群部署文档.md │ ├── hbase │ ├── HBaseFilter.java │ ├── HBaseReadWrite.java │ ├── HBaseTestUtil.java │ ├── HBase读写的几种方式.pdf │ ├── Utils.java │ ├── hdfs2hbase │ │ ├── HDFS2HBase.java │ │ └── scores.txt │ └── processor │ │ ├── HBasePerson.java │ │ └── HBaseProcessor.java │ ├── hive │ ├── MyUDAF.java │ ├── MyUDF.java │ ├── MyUDTF.java │ └── sql │ │ └── test.sql │ ├── kafka │ ├── CustomPartitioner.java │ ├── consumer │ │ ├── ConsumerCommitOffset.java │ │ ├── ConsumerInterceptorTTL.java │ │ ├── ConsumerReBalance.java │ │ └── ConsumerThread.java │ ├── producer │ │ └── ProducerRandomInt.java │ ├── quickstart │ │ ├── ConsumerQuickStart.java │ │ └── ProducerQuickStart.java │ └── readme.md │ ├── redis │ ├── CacheTuning.java │ ├── DistributedTool.java │ ├── lettuce │ │ ├── LettuceTools.java │ │ ├── MyListener.java │ │ ├── PubSubByLettuce.java │ │ ├── QuickStartByLettuce.java │ │ └── TransactionsByLettuce.java │ ├── redisson │ │ ├── RedissonDelayQueue.java │ │ └── RedissonDelayQueueConsumer.java │ ├── redis持久化详解.md │ ├── redis的五大数据类型.md │ └── 分布式数据库与缓存双写一致性方案.md │ └── zookeeper │ ├── discovery │ ├── client │ │ └── DistributeClient.java │ ├── server │ │ └── DistributeServer.java │ └── 服务注册与发现.md │ ├── election │ ├── Broker_1.java │ ├── Broker_2.java │ ├── Broker_3.java │ └── ZkElectionUtil.java │ └── zookeeper选举机制.pdf └── resources └── log4j2.xml /README.md: -------------------------------------------------------------------------------- 1 | ### Hadoop Practice 2 | 3 | - [hadoop集群部署](https://github.com/sev7e0/bigdata-practice/blob/master/src/main/java/com/tools/hadoop/%E9%9B%86%E7%BE%A4%E9%83%A8%E7%BD%B2%E6%96%87%E6%A1%A3.md) 4 | - [HDFS](https://github.com/sev7e0/bigdata-practice/tree/master/src/main/java/com/tools/hadoop/hdfs) 5 | - [MapReduce](https://github.com/sev7e0/bigdata-practice/tree/master/src/main/java/com/tools/hadoop/mr) 6 | 7 | ### HBase Practice 8 | - [HBase读写操作](https://github.com/sev7e0/bigdata-practice/tree/master/src/main/java/com/tools/hbase) 9 | - [HBase过滤器](https://github.com/sev7e0/bigdata-practice/blob/master/src/main/java/com/tools/hbase/HBaseFilter.java) 10 | - [HBase协处理器](https://github.com/sev7e0/bigdata-practice/blob/master/src/main/java/com/tools/hbase/processor/HBaseProcessor.java) 11 | 12 | ### Hive Practice 13 | 14 | 15 | 16 | ### Kafka Practice 17 | 18 | - 
[生产者/消费者QuickStart](https://github.com/sev7e0/bigdata-practice/tree/master/src/main/java/com/tools/kafka) 19 | - [自定义消费者消息过期拦截器](https://github.com/sev7e0/bigdata-practice/blob/master/src/main/java/com/tools/kafka/consumer/ConsumerInterceptorTTL.java) 20 | - [Kafka ReBalance监听器](https://github.com/sev7e0/bigdata-practice/blob/master/src/main/java/com/tools/kafka/consumer/ConsumerReBalance.java) 21 | - [Kafka 消费者多线程消费](https://github.com/sev7e0/bigdata-practice/blob/master/src/main/java/com/tools/kafka/consumer/ConsumerThread.java) 22 | 23 | ### Click House 24 | 25 | - [docker部署文档](https://github.com/sev7e0/bigdata-practice/blob/master/src/main/java/com/tools/clickhouse/%E7%8E%AF%E5%A2%83%E6%90%AD%E5%BB%BA.md) 26 | 27 | ### Redis Practice 28 | 29 | - [缓存击穿,缓存穿透,缓存雪崩的解决方案](https://github.com/sev7e0/bigdata-practice/tree/master/src/main/java/com/tools/redis) 30 | - [分布式数据库与缓存双写一致性解决方案](https://github.com/sev7e0/bigdata-practice/tree/master/src/main/java/com/tools/redis) 31 | - [使用redis简单正确实现分布式锁](https://github.com/sev7e0/bigdata-practice/tree/master/src/main/java/com/tools/redis) 32 | - [使用Lettuce作为redis客户端实例](https://github.com/sev7e0/bigdata-practice/tree/master/src/main/java/com/tools/redis) 33 | - [redis持久化详解](https://github.com/sev7e0/bigdata-practice/tree/master/src/main/java/com/tools/redis) 34 | 35 | ### Zookeeper Practice 36 | 37 | - [基于zookeeper的服务注册与发现](https://github.com/sev7e0/bigdata-practice/tree/master/src/main/java/com/tools/zookeeper/discovery) 38 | - [基于zookeeper实现leader选举](https://github.com/sev7e0/bigdata-practice/tree/master/src/main/java/com/tools/zookeeper/election) 39 | - [zookeeper的leader选举机制](https://github.com/sev7e0/bigdata-practice/tree/master/src/main/java/com/tools/zookeeper/zookeeper选举机制.pdf) -------------------------------------------------------------------------------- /log/test.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sev7e0/bigdata-practice/ffbdd93bd555fd388d4dd20ccc3379124a3eae5f/log/test.log -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 4 | 4.0.0 5 | com.sev7e0 6 | bigdata-practice 7 | 0 8 | 9 | 10 | 11 | org.apache.maven.plugins 12 | maven-compiler-plugin 13 | 14 | 8 15 | 8 16 | 17 | 18 | 19 | 20 | 21 | 22 | cloudera 23 | https://repository.cloudera.com/artifactory/cloudera-repos/ 24 | 25 | 26 | 27 | 2.7.5 28 | 1.2.6 29 | 2.2.0 30 | 2.10.0 31 | 1.16.20 32 | 3.4.14 33 | 16.0 34 | 5.1.6.RELEASE 35 | 1.7.22 36 | 2.17.0 37 | 2.3.2 38 | 3.1.2 39 | 40 | 41 | 42 | 43 | 44 | org.apache.kafka 45 | kafka_2.12 46 | ${kafka.version} 47 | 48 | 49 | com.fasterxml.jackson.core 50 | * 51 | 52 | 53 | 54 | 55 | 56 | org.apache.hadoop 57 | hadoop-client 58 | ${hadoop.version} 59 | 60 | 61 | io.netty 62 | * 63 | 64 | 65 | slf4j-log4j12 66 | org.slf4j 67 | 68 | 69 | 70 | 71 | 72 | org.apache.hbase 73 | hbase-server 74 | ${hbase.version} 75 | 76 | 77 | io.netty 78 | * 79 | 80 | 81 | slf4j-log4j12 82 | org.slf4j 83 | 84 | 85 | 86 | 87 | org.apache.hbase 88 | hbase-client 89 | ${hbase.version} 90 | 91 | 92 | io.netty 93 | * 94 | 95 | 96 | slf4j-log4j12 97 | org.slf4j 98 | 99 | 100 | 101 | 102 | 103 | 104 | org.apache.hive 105 | hive-exec 106 | ${hive.version} 107 | 108 | 109 | slf4j-log4j12 110 | org.slf4j 111 | 112 | 113 | org.glassfish 114 | javax.el 115 | 116 | 117 | 118 | 119 | org.apache.hive 120 | hive-jdbc 121 | ${hive.version} 122 | 123 | 124 | slf4j-log4j12 125 
| org.slf4j 126 | 127 | 128 | org.glassfish 129 | javax.el 130 | 131 | 132 | 133 | 134 | org.apache.hive 135 | hive-cli 136 | ${hive.version} 137 | 138 | 139 | slf4j-log4j12 140 | org.slf4j 141 | 142 | 143 | org.glassfish 144 | javax.el 145 | 146 | 147 | 148 | 149 | 150 | 151 | org.apache.zookeeper 152 | zookeeper 153 | ${zookeeper.version} 154 | 155 | 156 | io.netty 157 | * 158 | 159 | 160 | slf4j-log4j12 161 | org.slf4j 162 | 163 | 164 | 165 | 166 | org.apache.curator 167 | curator-framework 168 | 4.0.0 169 | 170 | 171 | slf4j-log4j12 172 | org.slf4j 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | redis.clients 185 | jedis 186 | ${redis.verison} 187 | 188 | 189 | 190 | io.lettuce 191 | lettuce-core 192 | ${lettuce.version} 193 | 194 | 195 | slf4j-log4j12 196 | org.slf4j 197 | 198 | 199 | 200 | 201 | org.slf4j 202 | slf4j-api 203 | ${slf4j.version} 204 | 205 | 206 | org.apache.curator 207 | curator-recipes 208 | 4.0.0 209 | 210 | 211 | slf4j-log4j12 212 | org.slf4j 213 | 214 | 215 | 216 | 217 | org.apache.pulsar 218 | pulsar-client 219 | ${pulsar.version} 220 | 221 | 222 | slf4j-log4j12 223 | org.slf4j 224 | 225 | 226 | 227 | 228 | com.alibaba 229 | fastjson 230 | 1.2.33 231 | 232 | 233 | slf4j-log4j12 234 | org.slf4j 235 | 236 | 237 | 238 | 239 | org.glassfish 240 | javax.el 241 | 3.0.1-b06 242 | 243 | 244 | slf4j-log4j12 245 | org.slf4j 246 | 247 | 248 | 249 | 250 | org.redisson 251 | redisson 252 | 3.14.0 253 | 254 | 255 | slf4j-log4j12 256 | org.slf4j 257 | 258 | 259 | 260 | 261 | org.projectlombok 262 | lombok 263 | 1.18.4 264 | 265 | 266 | org.apache.logging.log4j 267 | log4j-core 268 | ${log4j2.version} 269 | 270 | 271 | org.apache.logging.log4j 272 | log4j-api 273 | ${log4j2.version} 274 | 275 | 276 | 277 | -------------------------------------------------------------------------------- /src/main/java/com/tools/clickhouse/环境搭建.md: -------------------------------------------------------------------------------- 1 | ## Click House docker环境搭建 2 | 3 | ### 服务搭建 4 | 5 | ```shell 6 | docker run \ 7 | --name clickhouse-docker \ 8 | -d \ 9 | -p 8123:8123 \ 10 | -p 9000:9000 \ 11 | -p 9009:9009 \ 12 | --ulimit nofile=262144:262144 \ 13 | -v $PWD/data/:/var/lib/clickhouse yandex/clickhouse-server 14 | ``` 15 | 16 | ### 原生客户端搭建 17 | 18 | ```shell 19 | docker run \ 20 | -it --rm --link clickhouse-docker:clickhouse-server \ 21 | yandex/clickhouse-client --host clickhouse-docker 22 | ``` -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/config/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | fs.defaultFS 5 | hdfs://ns1 6 | 7 | 8 | hadoop.tmp.dir 9 | /hadoopadmin/datadir/hadoop/tempDatas 10 | 11 | 12 | 13 | io.file.buffer.size 14 | 4096 15 | 16 | 17 | fs.trash.interval 18 | 10080 19 | 检查点被删除后的分钟数。 如果为零,垃圾桶功能将被禁用。 20 | 该选项可以在服务器和客户端上配置。 如果垃圾箱被禁用服务器端,则检查客户端配置。 21 | 如果在服务器端启用垃圾箱,则会使用服务器上配置的值,并忽略客户端配置值。 22 | 23 | 24 | 25 | ha.zookeeper.quorum 26 | node01:2181,node02:2181,node03:2181 27 | 28 | 29 | fs.trash.checkpoint.interval 30 | 0 31 | 垃圾检查点之间的分钟数。 应该小于或等于fs.trash.interval。 32 | 如果为零,则将该值设置为fs.trash.interval的值。 每次检查指针运行时, 33 | 它都会从当前创建一个新的检查点,并删除比fs.trash.interval更早创建的检查点。 34 | 35 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/config/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | dfs.nameservices 5 | ns1 6 | 7 | 8 | 9 | 
dfs.ha.namenodes.ns1 10 | nn1,nn2 11 | 12 | 13 | 14 | dfs.namenode.rpc-address.ns1.nn1 15 | node01:9000 16 | 17 | 18 | 19 | dfs.namenode.http-address.ns1.nn1 20 | node01:50070 21 | 22 | 23 | 24 | dfs.namenode.rpc-address.ns1.nn2 25 | node02:9000 26 | 27 | 28 | 29 | dfs.namenode.http-address.ns1.nn2 30 | node02:50070 31 | 32 | 33 | dfs.namenode.secondary.http-address 34 | node01:50090 35 | 36 | 37 | dfs.namenode.http-address 38 | node01:50070 39 | 40 | 41 | dfs.namenode.name.dir 42 | file:///hadoopadmin/datadir/hadoop/namenodeDatas 43 | 44 | 45 | 46 | dfs.datanode.data.dir 47 | file:///hadoopadmin/datadir/hadoop/datanodeDatas 48 | 49 | 50 | dfs.namenode.edits.dir 51 | file:///hadoopadmin/datadir/hadoop/dfs/nn/edits 52 | 53 | 54 | dfs.namenode.checkpoint.dir 55 | file:///hadoopadmin/datadir/hadoop/dfs/snn/name 56 | 57 | 58 | dfs.namenode.checkpoint.edits.dir 59 | file:///hadoopadmin/datadir/hadoop/dfs/nn/snn/edits 60 | 61 | 62 | dfs.replication 63 | 3 64 | 65 | 66 | dfs.permissions 67 | false 68 | 69 | 70 | dfs.blocksize 71 | 134217728 72 | 73 | 74 | 75 | dfs.namenode.shared.edits.dir 76 | qjournal://node01:8485;node02:8485;node03:8485/ns1 77 | 78 | 79 | 80 | dfs.journalnode.edits.dir 81 | /hadoopadmin/datadir/hadoop/journal 82 | 83 | 84 | 85 | dfs.ha.automatic-failover.enabled 86 | true 87 | 88 | 89 | 90 | dfs.client.failover.proxy.provider.ns1 91 | org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider 92 | 93 | 94 | 95 | dfs.ha.fencing.methods 96 | 97 | sshfence 98 | shell(/bin/true) 99 | 100 | 101 | 102 | 103 | dfs.ha.fencing.ssh.private-key-files 104 | /home/hadoopadmin/.ssh/id_rsa 105 | 106 | 107 | 108 | dfs.ha.fencing.ssh.connect-timeout 109 | 30000 110 | 111 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/config/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | mapreduce.framework.name 5 | yarn 6 | 7 | 8 | mapreduce.job.ubertask.enable 9 | true 10 | 11 | 12 | mapreduce.jobhistory.address 13 | node01:10020 14 | 15 | 16 | mapreduce.jobhistory.webapp.address 17 | node01:19888 18 | 19 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/config/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | yarn.resourcemanager.ha.enabled 5 | true 6 | 7 | 8 | 9 | yarn.resourcemanager.cluster-id 10 | cluster1 11 | 12 | 13 | 14 | yarn.resourcemanager.ha.rm-ids 15 | rm1,rm2 16 | 17 | 18 | 19 | yarn.resourcemanager.hostname.rm1 20 | node01 21 | 22 | 23 | yarn.resourcemanager.hostname.rm2 24 | node02 25 | 26 | 27 | yarn.resourcemanager.recovery.enabled 28 | true 29 | 30 | 31 | yarn.resourcemanager.store.class 32 | org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore 33 | 34 | 35 | 36 | yarn.resourcemanager.zk-address 37 | node01:2181,node02:2181,node03:2181 38 | 39 | 40 | yarn.nodemanager.aux-services 41 | mapreduce_shuffle 42 | 43 | 44 | yarn.log-aggregation-enable 45 | true 46 | 47 | 48 | yarn.log.server.url 49 | http://node01:19888/jobhistory/logs 50 | 51 | 52 | 53 | yarn.log-aggregation.retain-seconds 54 | 2592000 55 | 56 | 57 | 58 | 59 | yarn.nodemanager.log.retain-seconds 60 | 604800 61 | 62 | 63 | 64 | 65 | yarn.nodemanager.log-aggregation.compression-type 66 | gz 67 | 68 | 69 | 70 | yarn.nodemanager.local-dirs 71 | /hadoopadmin/datadir/hadoop/yarn/local 72 | 73 | 74 | 75 | 
yarn.resourcemanager.max-completed-applications 76 | 1000 77 | 78 | 106 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/hdfs/HDFSClient.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.hdfs; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import org.apache.commons.compress.utils.IOUtils; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.*; 7 | import org.junit.Before; 8 | import org.junit.Test; 9 | 10 | import java.io.FileInputStream; 11 | import java.io.FileOutputStream; 12 | import java.io.IOException; 13 | import java.net.URI; 14 | import java.util.Iterator; 15 | import java.util.Map.Entry; 16 | import java.util.stream.Stream; 17 | 18 | @Slf4j 19 | public class HDFSClient { 20 | 21 | private FileSystem fileSystem = null; 22 | private Configuration configuration = null; 23 | 24 | private static final String DFS_PATH = "/hadoop/yarn-site.xml"; 25 | private static final String LOCAL_PATH = "src/main/java/com/tools/hadoop/config/yarn-site.xml"; 26 | 27 | @Before 28 | public void init() throws Exception { 29 | configuration = new Configuration(); 30 | 31 | // ******注意:以上配置都是由由就近原则进行配置调用****** 32 | // configuration.set > 自定义配置文件 > jar中配置文件 > 服务端配置 33 | fileSystem = FileSystem.get(new URI("hdfs://spark02:9000"), configuration, "hadoopadmin"); 34 | // 问题:配置HA,为什么在链接standby的namenode时会报错。 35 | 36 | } 37 | 38 | //流方式下载 39 | /** 40 | * upload file by stream 41 | */ 42 | @Test 43 | public void uploadByIO() throws IllegalArgumentException, IOException { 44 | FSDataOutputStream dataOutputStream = fileSystem.create(new Path(DFS_PATH), true); 45 | FileInputStream fileInputStream = new FileInputStream(LOCAL_PATH); 46 | IOUtils.copy(fileInputStream, dataOutputStream); 47 | } 48 | 49 | /** 50 | * download file by stream 51 | */ 52 | @Test 53 | public void downloadByIO() throws IllegalArgumentException, IOException { 54 | FSDataInputStream open = fileSystem.open(new Path(DFS_PATH)); 55 | FileOutputStream fileOutputStream = new FileOutputStream(LOCAL_PATH); 56 | IOUtils.copy(open, fileOutputStream); 57 | } 58 | 59 | /** 60 | * 通过流的方式将文件打印到屏幕上 61 | */ 62 | @Test 63 | public void downloadByIOToDisplay() throws IllegalArgumentException, IOException { 64 | FSDataInputStream open = fileSystem.open(new Path(DFS_PATH)); 65 | IOUtils.copy(open, System.out); 66 | } 67 | 68 | 69 | // client方式 70 | /** 71 | * download file 72 | */ 73 | @Test 74 | public void downloadCommand() throws IllegalArgumentException, IOException { 75 | //在文件拷贝与上传的过程中路径要精确到文件名 76 | fileSystem.copyToLocalFile(new Path(DFS_PATH), new Path(LOCAL_PATH)); 77 | fileSystem.close(); 78 | } 79 | 80 | 81 | /** 82 | * upload file 83 | */ 84 | @Test 85 | public void uploadCommand() { 86 | try { 87 | // hdfs路径要精确到文件名。 88 | fileSystem.copyFromLocalFile(new Path(LOCAL_PATH), new Path(DFS_PATH)); 89 | fileSystem.close(); 90 | } catch (IOException e) { 91 | log.error("upload file to hdfs failed :{}", e.getMessage()); 92 | } 93 | } 94 | 95 | /** 96 | * get cluster config 97 | */ 98 | @Test 99 | public void getConfiguration() { 100 | Iterator> iterator = configuration.iterator(); 101 | while (iterator.hasNext()) { 102 | System.out.println("name:" + iterator.next().getKey() + " ---- value:" + iterator.next().getValue()); 103 | } 104 | try { 105 | fileSystem.close(); 106 | } catch (IOException e) { 107 | log.error("fileSystem close error :{}", e.getMessage()); 108 | } 109 | } 110 | 111 | /** 112 | * mkdir 
on hdfs 113 | */ 114 | @Test 115 | public void mkdirOnHDFS() { 116 | try { 117 | Boolean mkdirRes = fileSystem.mkdirs(new Path(DFS_PATH)); 118 | fileSystem.close(); 119 | System.out.println(mkdirRes); 120 | } catch (IOException e) { 121 | log.error("make directory on hdfs failed :{}", e.getMessage()); 122 | } 123 | /** 124 | * output: 125 | * true/false 126 | */ 127 | } 128 | 129 | 130 | /** 131 | * delete file or directory 132 | */ 133 | @Test 134 | public void deleteFromHDFS() { 135 | try { 136 | System.out.println(fileSystem.delete(new Path(DFS_PATH), true)); 137 | fileSystem.close(); 138 | } catch (IOException e) { 139 | log.error("delete file or directory failed :{}", e.getMessage()); 140 | } 141 | /** 142 | * output: 143 | * true/false 144 | */ 145 | } 146 | 147 | 148 | /** 149 | * 150 | * get all file or dir at path 151 | */ 152 | @Test 153 | public void listFileFromHDFS(){ 154 | FileStatus[] fs = new FileStatus[0]; 155 | try { 156 | fs = fileSystem.listStatus(new Path("/hadoop")); 157 | } catch (IOException e) { 158 | log.error("get all file or dir error :{}",e.getMessage()); 159 | } 160 | Path[] listPath = FileUtil.stat2Paths(fs); 161 | Stream.of(listPath).forEach(path -> System.out.println(path.getName())); 162 | /** 163 | * output: 164 | * core-site.xml 165 | * hdfs-site.xml 166 | * */ 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/hdfs/HDFSStream.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.hdfs; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import org.apache.commons.compress.utils.IOUtils; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FSDataInputStream; 7 | import org.apache.hadoop.fs.FSDataOutputStream; 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import org.junit.Before; 11 | import org.junit.Test; 12 | 13 | import java.io.FileInputStream; 14 | import java.io.FileOutputStream; 15 | import java.io.IOException; 16 | import java.net.URI; 17 | 18 | @Slf4j 19 | public class HDFSStream { 20 | 21 | private FileSystem fileSystem = null; 22 | 23 | private Configuration configuration = null; 24 | 25 | private static final String DFS_PATH = "/hadoop/yarn-site.xml"; 26 | private static final String LOCAL_PATH = "src/main/java/com/tools/hadoop/config/yarn-site.xml"; 27 | 28 | @Before 29 | public void init() throws Exception { 30 | configuration = new Configuration(); 31 | fileSystem = FileSystem.get(new URI("hdfs://spark02:9000"), configuration, "hadoopadmin"); 32 | } 33 | 34 | 35 | /** 36 | * upload file by stream 37 | */ 38 | @Test 39 | public void uploadByIO() throws IllegalArgumentException, IOException { 40 | FSDataOutputStream dataOutputStream = fileSystem.create(new Path(DFS_PATH), true); 41 | FileInputStream fileInputStream = new FileInputStream(LOCAL_PATH); 42 | IOUtils.copy(fileInputStream, dataOutputStream); 43 | } 44 | 45 | /** 46 | * download file by stream 47 | */ 48 | @Test 49 | public void downloadByIO() throws IllegalArgumentException, IOException { 50 | FSDataInputStream open = fileSystem.open(new Path(DFS_PATH)); 51 | FileOutputStream fileOutputStream = new FileOutputStream(LOCAL_PATH); 52 | IOUtils.copy(open, fileOutputStream); 53 | } 54 | 55 | /** 56 | * 通过流的方式将文件打印到屏幕上 57 | */ 58 | @Test 59 | public void downloadByIOToDisplay() throws IllegalArgumentException, IOException { 60 | FSDataInputStream open = fileSystem.open(new 
Path(DFS_PATH)); 61 | IOUtils.copy(open, System.out); 62 | } 63 | 64 | 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/hdfs/UploadFileToHdfsByCompress.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.hdfs; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.fs.FSDataOutputStream; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.IOUtils; 9 | import org.apache.hadoop.io.compress.BZip2Codec; 10 | import org.apache.hadoop.io.compress.CompressionOutputStream; 11 | import org.junit.Test; 12 | 13 | import java.io.BufferedInputStream; 14 | import java.io.FileInputStream; 15 | import java.io.IOException; 16 | import java.net.URI; 17 | 18 | @Slf4j 19 | public class UploadFileToHdfsByCompress { 20 | 21 | private static final String DFS_PATH = "/hadoop/yarn-site.xml"; 22 | private static final String LOCAL_PATH = "src/main/java/com/tools/hadoop/config/yarn-site.xml"; 23 | private static final String url = "hdfs://spark02:9000"; 24 | 25 | @Test 26 | public void uploadByCompress() { 27 | Configuration configuration = new Configuration(); 28 | BZip2Codec codec = new BZip2Codec(); 29 | codec.setConf(configuration); 30 | try { 31 | BufferedInputStream inputStream = new BufferedInputStream(new FileInputStream(LOCAL_PATH)); 32 | FileSystem fileSystem = FileSystem.get(URI.create(url), configuration, "hadoopadmin"); 33 | FSDataOutputStream outputStream = fileSystem.create(new Path(DFS_PATH)); 34 | 35 | CompressionOutputStream codecOutputStream = codec.createOutputStream(outputStream); 36 | IOUtils.copyBytes(inputStream, codecOutputStream, configuration); 37 | log.info("upload success, local path: {}, hdfs path: {}", LOCAL_PATH, DFS_PATH); 38 | } catch (InterruptedException | IOException e) { 39 | log.error("upload error:{}", e.getMessage()); 40 | } 41 | 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/HFileGenerator.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr; 2 | 3 | import com.tools.hbase.Utils; 4 | import lombok.extern.slf4j.Slf4j; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.hbase.TableName; 9 | import org.apache.hadoop.hbase.client.Connection; 10 | import org.apache.hadoop.hbase.client.ConnectionFactory; 11 | import org.apache.hadoop.hbase.client.Table; 12 | import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2; 13 | import org.apache.hadoop.mapred.FileInputFormat; 14 | import org.apache.hadoop.mapred.FileOutputFormat; 15 | import org.apache.hadoop.mapred.JobConf; 16 | import org.apache.hadoop.mapreduce.Job; 17 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 18 | 19 | import java.io.IOException; 20 | import java.net.URI; 21 | import java.util.Objects; 22 | import java.util.UUID; 23 | 24 | /** 25 | * bulk load like a ETL 26 | * - Extract: from text file or another database into HDFS 27 | * - Transform: data into HFile(HBase's own file format) 28 | * - Load: load the HFile into HBase and tell region server where to find them 29 | */ 30 | 31 | @Slf4j 32 | public class HFileGenerator { 33 | 34 | public static void main(String[] args) { 35 | if 
(args.length < 4) { 36 | System.err.println("Usage:hadoop jar HFileGenerator.jar inputPath outputPath tableName configPath"); 37 | System.exit(0); 38 | } 39 | Job job = createJob(args[0], args[1], args[2], args[3]); 40 | if (Objects.isNull(job)) { 41 | log.error("error in create job!"); 42 | } 43 | try { 44 | if (job.waitForCompletion(true)) { 45 | log.info("execute job finish!"); 46 | Utils.doBulkLoad(job.getConfiguration(), args[1], args[2]); 47 | } else { 48 | log.error("execute job failed!!"); 49 | } 50 | } catch (IOException | InterruptedException | ClassNotFoundException e) { 51 | e.printStackTrace(); 52 | } 53 | 54 | } 55 | 56 | public static Job createJob(String inputPath, String outputPath, String tableName, String configPath) { 57 | Configuration configuration = new Configuration(); 58 | configuration.addResource(new Path(configPath)); 59 | configuration.set("hbase.fs.tmp.dir", "partition_" + UUID.randomUUID()); 60 | Job job = null; 61 | try { 62 | try { 63 | FileSystem fileSystem = FileSystem.get(URI.create(outputPath), configuration); 64 | fileSystem.delete(new Path(outputPath), true); 65 | fileSystem.close(); 66 | } catch (IOException e) { 67 | e.printStackTrace(); 68 | } 69 | Connection connection = ConnectionFactory.createConnection(configuration); 70 | Table table = connection.getTable(TableName.valueOf(tableName)); 71 | job = Job.getInstance(configuration); 72 | job.setJobName("HFileGenerator Job"); 73 | 74 | job.setJarByClass(HFileGenerator.class); 75 | job.setOutputFormatClass(TextOutputFormat.class); 76 | job.setMapperClass(HFileImportMapper.class); 77 | FileInputFormat.setInputPaths(new JobConf(configuration), inputPath); 78 | FileOutputFormat.setOutputPath(new JobConf(configuration), new Path(outputPath)); 79 | 80 | HFileOutputFormat2.configureIncrementalLoad(job, table, connection.getRegionLocator(TableName.valueOf(tableName))); 81 | } catch (Exception e) { 82 | 83 | } 84 | return job; 85 | } 86 | 87 | } 88 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/HFileImportMapper.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import org.apache.hadoop.hbase.KeyValue; 5 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable; 6 | import org.apache.hadoop.hbase.util.Bytes; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Mapper; 10 | 11 | import java.io.IOException; 12 | import java.time.LocalDate; 13 | import java.time.format.DateTimeFormatter; 14 | 15 | @Slf4j 16 | public class HFileImportMapper extends Mapper { 17 | 18 | protected final String CF_KQ = "cf"; 19 | 20 | @Override 21 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 22 | String line = value.toString(); 23 | log.info("read line: {}", line); 24 | String[] strings = line.split(" "); 25 | String row = LocalDate.now().format(DateTimeFormatter.BASIC_ISO_DATE) + "_" + strings[1]; 26 | ImmutableBytesWritable writable = new ImmutableBytesWritable(Bytes.toBytes(row)); 27 | KeyValue keyValue = new KeyValue(Bytes.toBytes(row), this.CF_KQ.getBytes(), strings[1].getBytes(), strings[2].getBytes()); 28 | context.write(writable, keyValue); 29 | } 30 | } -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/dataclean/DataClean.java: 
-------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr.dataclean; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.NullWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 9 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 10 | import org.apache.hadoop.mapreduce.Counter; 11 | import org.apache.hadoop.mapreduce.Job; 12 | import org.apache.hadoop.mapreduce.Mapper; 13 | 14 | import java.io.IOException; 15 | 16 | public class DataClean { 17 | /** 18 | * 19 | * 注意:若要IDEA中,本地运行MR程序,需要将resources/mapred-site.xml中的mapreduce.framework.name属性值,设置成local 20 | * @param args 21 | */ 22 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 23 | 24 | //判断一下,输入参数是否是两个,分别表示输入路径、输出路径 25 | if (args == null || args.length != 2) { 26 | System.out.println("please input Path!"); 27 | System.exit(0); 28 | } 29 | 30 | Configuration configuration = new Configuration(); 31 | 32 | //调用getInstance方法,生成job实例 33 | Job job = Job.getInstance(configuration, DataClean.class.getSimpleName()); 34 | 35 | //设置jar包,参数是包含main方法的类 36 | job.setJarByClass(DataClean.class); 37 | 38 | //设置输入/输出路径 39 | FileInputFormat.setInputPaths(job, new Path(args[0])); 40 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 41 | 42 | //设置处理Map阶段的自定义的类 43 | job.setMapperClass(DataCleanMapper.class); 44 | 45 | //注意:此处设置的map输出的key/value类型,一定要与自定义map类输出的kv对类型一致;否则程序运行报错 46 | job.setMapOutputKeyClass(Text.class); 47 | job.setMapOutputValueClass(NullWritable.class); 48 | 49 | //注意:因为不需要reduce聚合阶段,所以,需要显示设置reduce task个数是0 50 | job.setNumReduceTasks(0); 51 | 52 | // 提交作业 53 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 54 | } 55 | 56 | /** 57 | * 58 | * 自定义mapper类 59 | * 注意:若自定义的mapper类,与main方法在同一个类中,需要将自定义mapper类,声明成static的 60 | */ 61 | public static class DataCleanMapper extends Mapper { 62 | NullWritable nullValue = NullWritable.get(); 63 | 64 | @Override 65 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 66 | //自定义计数器,用于记录残缺记录数 67 | Counter counter = context.getCounter("DataCleaning", "damagedRecord"); 68 | 69 | //获得当前行数据 70 | //样例数据:20111230111645 169796ae819ae8b32668662bb99b6c2d 塘承高速公路规划线路图 1 1 http://auto.ifeng.com/roll/20111212/729164.shtml 71 | String line = value.toString(); 72 | 73 | String[] fields = line.split("\t"); 74 | 75 | if(fields.length != 6) { 76 | //若不是,则不输出,并递增自定义计数器 77 | counter.increment(1L); 78 | } else { 79 | //若是6,则原样输出 80 | context.write(value, nullValue); 81 | } 82 | } 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/duplicate/DuplicateRemoval.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr.duplicate; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | import org.apache.hadoop.mapreduce.Reducer; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | import org.slf4j.Logger; 13 | import org.slf4j.LoggerFactory; 14 | 15 | import java.io.IOException; 16 | 17 | /** 18 | * Title:MR 19 | * description: 20 | * 21 | * @author: Lijiaqi 22 | * @version: 1.0 23 | * @create: 2018-10-29 16:20 24 | **/ 25 | 26 | public class DuplicateRemoval { 27 | private final static Logger logger = LoggerFactory.getLogger(DuplicateRemoval.class); 28 | 29 | /** 30 | * map将输入中的value复制到输出数据的key上,并直接输出 31 | */ 32 | public static class Map extends Mapper { 33 | 34 | /** 35 | * 实现map函数 36 | */ 37 | @Override 38 | public void map(Object key, Text value, Context context) 39 | 40 | throws IOException, InterruptedException { 41 | context.write(value, new Text("")); 42 | } 43 | } 44 | 45 | /** 46 | * reduce将输入中的key复制到输出数据的key上,并直接输出 47 | */ 48 | public static class Reduce extends Reducer { 49 | /** 50 | * 实现reduce函数 51 | */ 52 | @Override 53 | public void reduce(Text key, Iterable values, Context context) 54 | throws IOException, InterruptedException { 55 | context.write(key, new Text("")); 56 | } 57 | } 58 | 59 | /** 60 | * 入口方法 61 | */ 62 | public static void main(String[] args) throws Exception { 63 | 64 | //默认不做任何配置,都由配置文件中加载 65 | Configuration conf = new Configuration(); 66 | /** 67 | * 默认入口携带第一个参数为输入路径,第二个为输出路径 68 | */ 69 | Path path = new Path(args[1]); 70 | 71 | //从配置文件中获取当前文件系统,判断是os还是hdfs 72 | FileSystem fs = FileSystem.get(conf); 73 | 74 | if (fs.exists(path)) { 75 | logger.error("Usage: Data Deduplication "); 76 | //当输出文件已经存在时,进行删除 77 | fs.delete(path, true); 78 | System.exit(2); 79 | } 80 | 81 | //Job job = new Job(conf);//已经不推荐使用 82 | //推荐使用当前方式 83 | Job job = Job.getInstance(conf, "DuplicateRemoval"); 84 | job.setJarByClass(DuplicateRemoval.class); 85 | 86 | //设置输入和输出目录 87 | FileInputFormat.setInputPaths(job, new Path(args[0])); 88 | FileOutputFormat.setOutputPath(job, path); 89 | 90 | //设置Map处理类 91 | job.setMapperClass(Map.class); 92 | job.setMapOutputKeyClass(Text.class); 93 | 
job.setMapOutputValueClass(Text.class); 94 | 95 | //设置Combine和Reduce处理类 96 | job.setCombinerClass(Reduce.class); 97 | job.setReducerClass(Reduce.class); 98 | 99 | //设置输出类型 100 | job.setOutputKeyClass(Text.class); 101 | job.setOutputValueClass(Text.class); 102 | 103 | System.exit(job.waitForCompletion(true) ? 0 : 1); 104 | 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/inputformat/SmallFiles2SequenceFile.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr.inputformat; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.conf.Configured; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.BytesWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Job; 9 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 10 | import org.apache.hadoop.util.Tool; 11 | import org.apache.hadoop.util.ToolRunner; 12 | 13 | public class SmallFiles2SequenceFile extends Configured implements Tool { 14 | 15 | public static void main(String[] args) throws Exception { 16 | int code = ToolRunner.run(new SmallFiles2SequenceFile(), args); 17 | System.exit(code); 18 | } 19 | 20 | @Override 21 | public int run(String[] strings) throws Exception { 22 | Configuration configuration = new Configuration(); 23 | 24 | configuration.set("mapreduce.map.output.compress", "true"); 25 | //设置map输出的压缩算法是:BZip2Codec,它是hadoop默认支持的压缩算法,且支持切分 26 | configuration.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); 27 | //开启job输出压缩功能 28 | configuration.set("mapreduce.output.fileoutputformat.compress", "true"); 29 | //指定job输出使用的压缩算法 30 | configuration.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); 31 | 32 | Job job = Job.getInstance(configuration, SmallFiles2SequenceFile.class.getName()); 33 | 34 | job.setJarByClass(SmallFiles2SequenceFile.class); 35 | 36 | job.setMapperClass(SmallFiles2SequenceMapper.class); 37 | 38 | job.setInputFormatClass(WholeFileInputFormat.class); 39 | 40 | WholeFileInputFormat.addInputPath(job, new Path(strings[0])); 41 | 42 | job.setOutputFormatClass(SequenceFileOutputFormat.class); 43 | 44 | SequenceFileOutputFormat.setOutputPath(job, new Path(strings[1])); 45 | 46 | job.setOutputKeyClass(Text.class); 47 | job.setOutputValueClass(BytesWritable.class); 48 | 49 | return job.waitForCompletion(true) ? 
0 : 1; 50 | 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/inputformat/SmallFiles2SequenceMapper.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr.inputformat; 2 | 3 | import org.apache.hadoop.fs.Path; 4 | import org.apache.hadoop.io.BytesWritable; 5 | import org.apache.hadoop.io.NullWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.InputSplit; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 10 | 11 | import java.io.IOException; 12 | 13 | public class SmallFiles2SequenceMapper extends Mapper { 14 | 15 | private Text filenameKey; 16 | 17 | @Override 18 | protected void setup(Context context) { 19 | InputSplit inputSplit = context.getInputSplit(); 20 | Path path = ((FileSplit) inputSplit).getPath(); 21 | filenameKey = new Text(path.toString()); 22 | } 23 | 24 | @Override 25 | protected void map(NullWritable key, BytesWritable value, Context context) throws IOException, InterruptedException { 26 | context.write(new Text(filenameKey), value); 27 | } 28 | 29 | @Override 30 | protected void cleanup(Context context) throws IOException, InterruptedException { 31 | super.cleanup(context); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/inputformat/WholeFileInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr.inputformat; 2 | 3 | import org.apache.hadoop.fs.Path; 4 | import org.apache.hadoop.mapreduce.InputSplit; 5 | import org.apache.hadoop.mapreduce.JobContext; 6 | import org.apache.hadoop.mapreduce.RecordReader; 7 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 8 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 9 | 10 | 11 | /** 12 | * 自定义inputformat实现先文件合并 13 | */ 14 | public class WholeFileInputFormat extends FileInputFormat { 15 | @Override 16 | protected boolean isSplitable(JobContext context, Path filename) { 17 | return false; 18 | } 19 | 20 | @Override 21 | public RecordReader createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) { 22 | WholeRecordReader wholeRecordReader = new WholeRecordReader(); 23 | wholeRecordReader.initialize(inputSplit, taskAttemptContext); 24 | return wholeRecordReader; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/inputformat/WholeRecordReader.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr.inputformat; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FSDataInputStream; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.BytesWritable; 8 | import org.apache.hadoop.io.IOUtils; 9 | import org.apache.hadoop.io.NullWritable; 10 | import org.apache.hadoop.mapreduce.InputSplit; 11 | import org.apache.hadoop.mapreduce.RecordReader; 12 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 13 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 14 | 15 | import java.io.IOException; 16 | 17 | public class WholeRecordReader extends RecordReader { 18 | 19 | private FileSplit fileSplit; 20 | 21 | private Configuration configuration; 22 | 23 | private 
BytesWritable value = new BytesWritable(); 24 | 25 | private boolean processed = false; 26 | 27 | 28 | @Override 29 | public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) { 30 | this.fileSplit = (FileSplit) inputSplit; 31 | this.configuration = taskAttemptContext.getConfiguration(); 32 | } 33 | 34 | @Override 35 | public boolean nextKeyValue() throws IOException, InterruptedException { 36 | if (!processed) { 37 | byte[] bytes = new byte[(int) fileSplit.getLength()]; 38 | Path path = fileSplit.getPath(); 39 | FileSystem fileSystem = path.getFileSystem(configuration); 40 | FSDataInputStream open = fileSystem.open(path); 41 | IOUtils.readFully(open, bytes, 0, bytes.length); 42 | value.set(bytes, 0, bytes.length); 43 | IOUtils.closeStream(open); 44 | processed = true; 45 | return true; 46 | } 47 | return false; 48 | } 49 | 50 | @Override 51 | public NullWritable getCurrentKey() throws IOException, InterruptedException { 52 | return NullWritable.get(); 53 | } 54 | 55 | @Override 56 | public BytesWritable getCurrentValue() throws IOException, InterruptedException { 57 | return value; 58 | } 59 | 60 | @Override 61 | public float getProgress() throws IOException, InterruptedException { 62 | return processed ? 1.0f : 0.0f; 63 | } 64 | 65 | @Override 66 | public void close() throws IOException { 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/outputformat/MyFileOutputFormat.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr.outputformat; 2 | 3 | import org.apache.hadoop.fs.FSDataOutputStream; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.NullWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.RecordWriter; 9 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 10 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 11 | 12 | import java.io.IOException; 13 | 14 | public class MyFileOutputFormat extends FileOutputFormat { 15 | //与reduce的输出泛型一致 16 | 17 | 18 | @Override 19 | public RecordWriter getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException { 20 | FileSystem fileSystem = FileSystem.get(taskAttemptContext.getConfiguration()); 21 | 22 | String bad = "hdfs://spark01:8020/outputformat/good/r.txt"; 23 | Path badPath = new Path(bad); 24 | String good = "hdfs://spark01:8020/outputformat/bad/r.txt"; 25 | Path goodPath = new Path(good); 26 | FSDataOutputStream badStream = fileSystem.create(badPath); 27 | FSDataOutputStream goodStream = fileSystem.create(goodPath); 28 | return new MyRecordWriter(badStream, goodStream); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/outputformat/MyFileOutputFormatMain.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr.outputformat; 2 | 3 | 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.conf.Configured; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.NullWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 12 | import org.apache.hadoop.util.Tool; 13 | import 
org.apache.hadoop.util.ToolRunner; 14 | 15 | import java.io.IOException; 16 | 17 | public class MyFileOutputFormatMain extends Configured implements Tool { 18 | @Override 19 | public int run(String[] strings) throws Exception { 20 | 21 | Configuration configuration = new Configuration(); 22 | 23 | Job job = Job.getInstance(configuration, MyFileOutputFormatMain.class.getName()); 24 | 25 | 26 | job.setJarByClass(MyFileOutputFormatMain.class); 27 | 28 | 29 | TextInputFormat.addInputPath(job, new Path(strings[0])); 30 | MyFileOutputFormat.setOutputPath(job, new Path(strings[1])); 31 | job.setMapperClass(MyFileOutputFormatMapper.class); 32 | job.setOutputFormatClass(MyFileOutputFormat.class); 33 | 34 | job.setOutputKeyClass(Text.class); 35 | job.setOutputValueClass(NullWritable.class); 36 | 37 | return job.waitForCompletion(true) ? 0 : 1; 38 | 39 | 40 | } 41 | 42 | public static void main(String[] args) throws Exception { 43 | ToolRunner.run(new MyFileOutputFormatMain(), args); 44 | } 45 | 46 | private static class MyFileOutputFormatMapper extends Mapper { 47 | @Override 48 | protected void map(Object key, Object value, Context context) throws IOException, InterruptedException { 49 | context.write(value, NullWritable.get()); 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/outputformat/MyRecordWriter.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr.outputformat; 2 | 3 | import org.apache.hadoop.fs.FSDataOutputStream; 4 | import org.apache.hadoop.io.NullWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.RecordWriter; 7 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 8 | 9 | import java.io.IOException; 10 | 11 | public class MyRecordWriter extends RecordWriter { 12 | FSDataOutputStream badOut; 13 | FSDataOutputStream goodOut; 14 | 15 | public MyRecordWriter(FSDataOutputStream badOut, FSDataOutputStream goodOut) { 16 | this.badOut = badOut; 17 | this.goodOut = goodOut; 18 | } 19 | 20 | @Override 21 | public void write(Text text, NullWritable nullWritable) throws IOException, InterruptedException { 22 | if (text.toString().split("\t")[9].equals("0")) { 23 | goodOut.write(text.toString().getBytes()); 24 | goodOut.write("\r\n".getBytes()); 25 | } else { 26 | badOut.write(text.toString().getBytes()); 27 | badOut.write("\r\n".getBytes()); 28 | } 29 | } 30 | 31 | @Override 32 | public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException { 33 | if (goodOut != null) { 34 | goodOut.close(); 35 | } 36 | if (badOut != null) { 37 | badOut.close(); 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/secondarysort/MyOrder.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr.secondarysort; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.conf.Configured; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.DoubleWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Job; 9 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 10 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 11 | import org.apache.hadoop.util.Tool; 12 | import org.apache.hadoop.util.ToolRunner; 13 | 14 | /** 15 | * 16 | */ 17 | public class 
MyOrder extends Configured implements Tool { 18 | 19 | 20 | @Override 21 | public int run(String[] strings) throws Exception { 22 | 23 | Configuration configuration = new Configuration(); 24 | 25 | Job job = Job.getInstance(configuration, MyOrder.class.getName()); 26 | 27 | FileInputFormat.setInputPaths(job, new Path(strings[0])); 28 | FileOutputFormat.setOutputPath(job, new Path(strings[1])); 29 | 30 | job.setJarByClass(MyOrder.class); 31 | 32 | job.setMapperClass(MyOrderMapper.class); 33 | job.setReducerClass(MyOrderReducer.class); 34 | 35 | job.setGroupingComparatorClass(MyOrderGroup.class); 36 | 37 | //如果map、reduce的输出的kv对类型一致,直接设置reduce的输出的kv对就行;如果不一样,需要分别设置map, reduce的输出的kv类型 38 | //注意:此处设置的map输出的key/value类型,一定要与自定义map类输出的kv对类型一致;否则程序运行报错 39 | job.setMapOutputKeyClass(OrderBean.class); 40 | job.setMapOutputValueClass(DoubleWritable.class); 41 | 42 | //设置reduce task最终输出key/value的类型 43 | //注意:此处设置的reduce输出的key/value类型,一定要与自定义reduce类输出的kv对类型一致;否则程序运行报错 44 | job.setOutputKeyClass(Text.class); 45 | job.setOutputValueClass(DoubleWritable.class); 46 | 47 | return job.waitForCompletion(true)?0:1; 48 | } 49 | 50 | public static void main(String[] args) throws Exception { 51 | System.exit(ToolRunner.run(new MyOrder(), args)); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/secondarysort/MyOrderGroup.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr.secondarysort; 2 | 3 | import org.apache.hadoop.io.WritableComparable; 4 | import org.apache.hadoop.io.WritableComparator; 5 | 6 | public class MyOrderGroup extends WritableComparator { 7 | 8 | public MyOrderGroup(){ 9 | // 标识当前的key为orderbean 10 | super(OrderBean.class,true); 11 | } 12 | 13 | @Override 14 | public int compare(WritableComparable a, WritableComparable b) { 15 | OrderBean aOrderBean = (OrderBean)a; 16 | OrderBean bOrderBean = (OrderBean)b; 17 | 18 | String aUserId = aOrderBean.getUserid(); 19 | String bUserId = bOrderBean.getUserid(); 20 | //userid、年、月相同的,作为一组 21 | int ret1 = aUserId.compareTo(bUserId); 22 | if(ret1 == 0) {//同一用户 23 | //年月也相同返回0,在同一组; 24 | return aOrderBean.getDatetime().compareTo(bOrderBean.getDatetime()); 25 | } else { 26 | return ret1; 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/secondarysort/MyOrderMapper.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr.secondarysort; 2 | 3 | import org.apache.hadoop.io.DoubleWritable; 4 | import org.apache.hadoop.io.LongWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Mapper; 7 | 8 | import java.io.IOException; 9 | import java.time.LocalDateTime; 10 | import java.time.format.DateTimeFormatter; 11 | import java.util.Optional; 12 | 13 | import static java.time.LocalDateTime.parse; 14 | 15 | 16 | /** 17 | * 泛型描述

输入key 输入value 输出key 输出value 18 | */ 19 | public class MyOrderMapper extends Mapper { 20 | 21 | @Override 22 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 23 | 24 | String[] values = Optional.ofNullable(value).map(v -> v.toString().split("\t")).get(); 25 | String yearMonthString; 26 | try { 27 | yearMonthString = getYearMonthString(values[1], "yyyy-MM-dd HH:mm:ss.SSS"); 28 | }catch (Exception e){ 29 | return; 30 | } 31 | if (values.length == 6){ 32 | //13764633023 2014-12-01 02:20:42.000 全视目Allseelook 原宿风暴显色美瞳彩色隐形艺术眼镜1片 拍2包邮 33.6 2 18067781305 33 | OrderBean orderBean = new OrderBean(values[0], 34 | yearMonthString, 35 | values[2], 36 | Double.parseDouble(values[3]), 37 | Integer.parseInt(values[4]), 38 | values[5]); 39 | 40 | DoubleWritable doubleWritable = new DoubleWritable(); 41 | doubleWritable.set(Double.parseDouble(values[3])*Integer.parseInt(values[4])); 42 | context.write(orderBean, doubleWritable); 43 | } 44 | } 45 | 46 | public static String getYearMonthString(String dateTime, String pattern) { 47 | DateTimeFormatter formatter = DateTimeFormatter.ofPattern(pattern); 48 | LocalDateTime localDateTime = parse(dateTime, formatter); 49 | int year = localDateTime.getYear(); 50 | int month = localDateTime.getMonthValue(); 51 | return year + "" + month; 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/secondarysort/MyOrderReducer.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr.secondarysort; 2 | 3 | import org.apache.hadoop.io.DoubleWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Reducer; 6 | 7 | import java.io.IOException; 8 | 9 | public class MyOrderReducer extends Reducer { 10 | 11 | @Override 12 | protected void reduce(OrderBean key, Iterable values, Context context) throws IOException, InterruptedException { 13 | int num = 0; 14 | for(DoubleWritable value: values) { 15 | if(num < 2) { 16 | String keyOut = key.getUserid() + "-----" + key.getDatetime(); 17 | context.write(new Text(keyOut), value); 18 | num++; 19 | } else { 20 | break; 21 | } 22 | } 23 | 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/secondarysort/MyPartitioner.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr.secondarysort; 2 | 3 | import org.apache.hadoop.io.DoubleWritable; 4 | import org.apache.hadoop.mapreduce.Partitioner; 5 | 6 | //mapper的输出key类型是自定义的key类型OrderBean;输出value类型是单笔订单的总开销double -> DoubleWritable 7 | public class MyPartitioner extends Partitioner { 8 | @Override 9 | public int getPartition(OrderBean orderBean, DoubleWritable doubleWritable, int numReduceTasks) { 10 | //userid相同的,落入同一分区 11 | return (orderBean.getUserid().hashCode() & Integer.MAX_VALUE) % numReduceTasks; 12 | } 13 | } -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/secondarysort/OrderBean.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr.secondarysort; 2 | 3 | import lombok.Data; 4 | import org.apache.hadoop.io.WritableComparable; 5 | 6 | import java.io.DataInput; 7 | import java.io.DataOutput; 8 | import java.io.IOException; 9 | 10 | @Data 11 | public class OrderBean implements 
WritableComparable { 12 | 13 | //用户ID 14 | private String userid; 15 | //年月 16 | //year+month -> 201408 17 | private String datetime; 18 | //标题 19 | private String title; 20 | //单价 21 | private double unitPrice; 22 | //购买量 23 | private int purchaseNum; 24 | //商品ID 25 | private String produceId; 26 | 27 | public OrderBean() { 28 | } 29 | 30 | public OrderBean(String userid, String datetime, String title, double unitPrice, int purchaseNum, String produceId) { 31 | super(); 32 | this.userid = userid; 33 | this.datetime = datetime; 34 | this.title = title; 35 | this.unitPrice = unitPrice; 36 | this.purchaseNum = purchaseNum; 37 | this.produceId = produceId; 38 | } 39 | 40 | //key的比较规则 41 | public int compareTo(OrderBean other) { 42 | //OrderBean作为MR中的key;如果对象中的userid相同,即ret1为0;就表示两个对象是同一个用户 43 | int isEquals = this.userid.compareTo(other.userid); 44 | 45 | if (isEquals == 0) { 46 | //如果userid相同,比较年月 47 | String thisYearMonth = this.getDatetime(); 48 | String otherYearMonth = other.getDatetime(); 49 | int isEqualsWithDate = thisYearMonth.compareTo(otherYearMonth); 50 | 51 | if(isEqualsWithDate == 0) {//若datetime相同 52 | //如果userid、年月都相同,比较单笔订单的总开销 53 | Double thisTotalPrice = this.getPurchaseNum()*this.getUnitPrice(); 54 | Double oTotalPrice = other.getPurchaseNum()*other.getUnitPrice(); 55 | //总花销降序排序;即总花销高的排在前边 56 | return -thisTotalPrice.compareTo(oTotalPrice); 57 | } else { 58 | //若datatime不同,按照datetime升序排序 59 | return isEqualsWithDate; 60 | } 61 | } else { 62 | //按照userid升序排序 63 | return isEquals; 64 | } 65 | } 66 | 67 | /** 68 | * 序列化 69 | * @param dataOutput 70 | * @throws IOException 71 | */ 72 | public void write(DataOutput dataOutput) throws IOException { 73 | dataOutput.writeUTF(userid); 74 | dataOutput.writeUTF(datetime); 75 | dataOutput.writeUTF(title); 76 | dataOutput.writeDouble(unitPrice); 77 | dataOutput.writeInt(purchaseNum); 78 | dataOutput.writeUTF(produceId); 79 | } 80 | 81 | /** 82 | * 反序列化 83 | * @param dataInput 84 | * @throws IOException 85 | */ 86 | public void readFields(DataInput dataInput) throws IOException { 87 | this.userid = dataInput.readUTF(); 88 | this.datetime = dataInput.readUTF(); 89 | this.title = dataInput.readUTF(); 90 | this.unitPrice = dataInput.readDouble(); 91 | this.purchaseNum = dataInput.readInt(); 92 | this.produceId = dataInput.readUTF(); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/wordcount/LogCount.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr.wordcount; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.LongWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Job; 9 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 10 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 11 | 12 | public class LogCount { 13 | 14 | public static void main(String[] args) throws Exception { 15 | 16 | // 读取配置文件 17 | Configuration conf = new Configuration(); 18 | 19 | Path out = new Path(args[1]); 20 | FileSystem fs = FileSystem.get(conf); 21 | 22 | //判断输出路径是否存在,当路径存在时mapreduce会报错 23 | if (fs.exists(out)) { 24 | fs.delete(out, true); 25 | System.out.println("ouput is exit will delete"); 26 | } 27 | 28 | // 创建任务 29 | Job job = Job.getInstance(conf, LogCount.class.getName()); 30 | // 设置job的主类 31 | job.setJarByClass(LogCount.class); 
// 主类 32 | 33 | // 设置作业的输入路径 34 | FileInputFormat.setInputPaths(job, new Path(args[0])); 35 | 36 | //设置map的相关参数 37 | job.setMapperClass(LogCountMapper.class); 38 | 39 | //设置reduce相关参数 40 | job.setReducerClass(LogCountReducer.class); 41 | job.setOutputKeyClass(Text.class); 42 | job.setOutputValueClass(LongWritable.class); 43 | 44 | //设置作业的输出路径 45 | FileOutputFormat.setOutputPath(job, out); 46 | 47 | job.setNumReduceTasks(2); 48 | 49 | 50 | System.exit(job.waitForCompletion(true) ? 0 : 1); 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/wordcount/LogCountMapper.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr.wordcount; 2 | 3 | import org.apache.hadoop.io.LongWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Mapper; 6 | 7 | import java.io.IOException; 8 | 9 | 10 | public class LogCountMapper extends Mapper { 11 | /** 12 | * 读取输入文件 13 | */ 14 | @Override 15 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 16 | String line = value.toString(); 17 | String[] words = line.split(" "); 18 | for (String word : words) { 19 | //通过上下文将结果输出 20 | context.write(new Text(word), new LongWritable(1L)); 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/wordcount/LogCountReducer.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr.wordcount; 2 | 3 | import org.apache.hadoop.io.LongWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Reducer; 6 | 7 | import java.io.IOException; 8 | 9 | public class LogCountReducer extends Reducer { 10 | /** 11 | * 12 | */ 13 | @Override 14 | protected void reduce(Text key, Iterable values, Context context) 15 | throws IOException, InterruptedException { 16 | Long count = 0L; 17 | for (LongWritable value : values) { 18 | count += value.get(); 19 | } 20 | //统计结果的输出 21 | context.write(key, new LongWritable(count)); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/wordcount/MyPartitioner.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr.wordcount; 2 | 3 | import org.apache.hadoop.io.LongWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Partitioner; 6 | 7 | /** 8 | * 自定义partitioner 9 | */ 10 | public class MyPartitioner extends Partitioner { 11 | 12 | 13 | /** 14 | * 15 | * 重写方法 16 | * 17 | * @param arg0 输入数据 18 | * @param arg1 19 | * @param arg2 20 | * @return 返回值为分区序号 21 | */ 22 | @Override 23 | public int getPartition(Text arg0, LongWritable arg1, int arg2) { 24 | 25 | 26 | if (arg0.toString().equals("hadoop")) { 27 | return 0; 28 | } 29 | if (arg0.toString().equals("spark")) { 30 | return 1; 31 | } 32 | if (arg0.toString().equals("hbase")) { 33 | return 2; 34 | } 35 | return 3; 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/wordcount/WordCountByPartitioner.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr.wordcount; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import 
org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.LongWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Job; 9 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 10 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 11 | 12 | public class WordCountByPartitioner { 13 | /** 14 | * @Title: main @Description: 定义的driver:封装了mapreduce作业的所有信息 @param @param 15 | * args @param @throws Exception @return void @throws 16 | */ 17 | public static void main(String[] args) throws Exception { 18 | 19 | // 设置环境变量HADOOP_USER_NAME,其值是root 20 | // 在本机调试 21 | // 读取配置文件 22 | Configuration conf = new Configuration(); 23 | conf.set("fs.defaultFS", "hdfs://spark01:9000"); 24 | conf.set("yarn.resourcemanager.hostname", "spark01"); 25 | 26 | 27 | /** 28 | * MR压缩相关 29 | * 在mr中为了减少磁盘和网络io同时可以开启压缩 30 | */ 31 | //map端压缩 32 | conf.set("mapreduce.map.output.compress", "true"); 33 | conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); 34 | //输出端压缩 35 | conf.set("mapreduce.output.fileoutputformat.compress", "true"); 36 | conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); 37 | 38 | Path out = new Path(args[1]); 39 | FileSystem fs = FileSystem.get(conf); 40 | 41 | //判断输出路径是否存在,当路径存在时mapreduce会报错 42 | if (fs.exists(out)) { 43 | fs.delete(out, true); 44 | System.out.println("ouput is exit will delete"); 45 | } 46 | 47 | // 创建任务 48 | Job job = Job.getInstance(conf, WordCountByPartitioner.class.getName()); 49 | // 设置job的主类 50 | job.setJarByClass(WordCountByPartitioner.class); // 主类 51 | 52 | // 设置作业的输入路径 53 | FileInputFormat.setInputPaths(job, new Path(args[0])); 54 | 55 | //设置map的相关参数 56 | job.setMapperClass(WordCountMapper.class); 57 | 58 | 59 | /** 60 | * 需要注意的事Combiner就是Reducer,他相当于在map端进行的一个reducer,以便于减少网络io 61 | * - 使用combine时,首先考虑当前MR是否适合combine 62 | * - 总原则是不论使不使用combine不能影响最终的结果 63 | * - 在MR时,发生数据倾斜,且可以使用combine时,可以使用combine缓解数据倾斜 64 | */ 65 | job.setCombinerClass(WordCountReducer.class); 66 | 67 | //设置reduce相关参数 68 | job.setReducerClass(WordCountReducer.class); 69 | job.setOutputKeyClass(Text.class); 70 | job.setOutputValueClass(LongWritable.class); 71 | 72 | // 设置自定义partitioner 73 | job.setPartitionerClass(MyPartitioner.class); 74 | 75 | //设置作业的输出路径 76 | FileOutputFormat.setOutputPath(job, out); 77 | 78 | //设置reduce为4 否则partitioner不会生效 79 | job.setNumReduceTasks(4); 80 | 81 | 82 | System.exit(job.waitForCompletion(true) ? 
0 : 1); 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/wordcount/WordCountMapper.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr.wordcount; 2 | 3 | import org.apache.hadoop.io.LongWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Mapper; 6 | 7 | import java.io.IOException; 8 | 9 | /** 10 | * @author LiJiaqi 11 | * @ClassName: WordCountMapper 12 | * @Description:使用mapreduce开发wordcount程序 13 | * @date 2018年8月22日 下午11:15:54 14 | */ 15 | public class WordCountMapper extends Mapper { 16 | /** 17 | * 读取输入文件 18 | */ 19 | @Override 20 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 21 | //按行读取文件 22 | String line = value.toString(); 23 | // 20111230115903 262c9791427904631304a5eea4484bd5 音乐 4 1 http://mp3.baidu.com/ 24 | String[] words = line.split(" "); 25 | // 通过上下文将结果输出 userid 262c9791427904631304a5eea4484bd5 26 | context.write(new Text(words[1]), new LongWritable(Long.parseLong(words[1]))); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/mr/wordcount/WordCountReducer.java: -------------------------------------------------------------------------------- 1 | package com.tools.hadoop.mr.wordcount; 2 | 3 | import org.apache.hadoop.io.LongWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.Reducer; 6 | 7 | import java.io.IOException; 8 | 9 | public class WordCountReducer extends Reducer { 10 | /** 11 | * 12 | */ 13 | @Override 14 | protected void reduce(Text key, Iterable values, Context context) 15 | throws IOException, InterruptedException { 16 | Long count = 0L; 17 | for (LongWritable value : values) { 18 | count += value.get(); 19 | } 20 | //统计结果的输出 21 | context.write(key, new LongWritable(count)); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hadoop/集群部署文档.md: -------------------------------------------------------------------------------- 1 | # 部署文档 2 | 3 | ## 服务器集群 4 | 5 | hostname | IP | user | password | path | os | 6 | ---------|----|-------|---------|-------|-----| 7 | node01 | 192.168.218.110 | hadoop | java | /hadoop | Centos7 8 | node02 | 192.168.218.120 | hadoop | java | /hadoop | Centos7 9 | node03 |192.168.218.130| hadoop | java | /hadoop | Centos7 10 | 11 | ## 集群规划 12 | 13 | node01 | node02 | node03 14 | ---------|---------|-------| 15 | namenode | namenode | 16 | datanode | datanode | datanode 17 | zookeeper | zookeeper | zookeeper 18 | ResourceManage | ResourceManage 19 | NodeManage | NodeManage | NodeManage 20 | JournalNode | JournalNode | JournalNode 21 | DFSZKFailoverController | DFSZKFailoverController | 22 | 23 | ## 组件版本 24 | 25 | 组件 | 版本 | 下载地址 26 | -----|------|------| 27 | Centos7 | CentOS-7-x86_64-DVD-1908.iso | [linux服务器下载地址](https://mirrors.aliyun.com/centos/7/isos/x86_64/CentOS-7-x86_64-DVD-1908.iso) 28 | JDK | jdk1.8.0_141 | [Jdk](https://www.oracle.com/technetwork/cn/java/javase/downloads/jdk8-downloads-2133151-zhs.html) 29 | Zookeeper | zookeeper-3.4.5-cdh5.14.2 | [zookeeper下载地址](https://www.baidu.com) 30 | Hadoop | hadoop-2.6.0-cdh5.14.2 | [hadoop下载地址](http://archive.cloudera.com/cdh5/cdh/5/) 31 | 32 | ## 虚拟机安装跳过 33 | 34 | ## 网卡配置 35 | 36 | mac参考以下链接 37 | 38 | [Mac VMware Fusion 
CentOS7配置静态IP](https://www.cnblogs.com/itbsl/p/10998696.html) 39 | 40 | windows参考以下链接 41 | 42 | [windows配置链接](https://www.baidu.com) 43 | 44 | ## 配置服务器 45 | 46 | 从此步开始,开始进行服务器基础环境配置。 47 | 48 | ### 将集群中所有的机器hostname ip 映射,添加到/etc/hosts 49 | 50 | 映射后集群间将不在需要使用ip 51 | 52 | ```shell 53 | 192.168.218.110 node01 54 | 192.168.218.120 node02 55 | 192.168.218.130 node03 56 | ``` 57 | 58 | ### 关闭防火墙 59 | 60 | 关闭防火墙主要是为了集群机器间的通信 61 | 62 | ```shell 63 | # 关闭防火墙 64 | systemctl disable firewalld.service 65 | 66 | # 查看防火墙状态 67 | systemctl status firewalld.service 68 | # 已关闭将输出 Active: inactive (dead) 69 | ``` 70 | 71 | ### 配置网卡及主机名 72 | 73 | 编辑文件`vim /etc/sysconfig/network-scripts/ifcfg-eth0` ,写入如下配置: 74 | 75 | ```shell 76 | TYPE=Ethernet 77 | PROXY_METHOD=none 78 | BROWSER_ONLY=no 79 | DEFROUTE=yes 80 | IPV4_FAILURE_FATAL=no 81 | IPV6INIT=yes 82 | IPV6_AUTOCONF=yes 83 | IPV6_DEFROUTE=yes 84 | IPV6_FAILURE_FATAL=no 85 | IPV6_ADDR_GEN_MODE=stable-privacy 86 | NAME=eth0 87 | DEVICE=eth0 88 | IPV6_PRIVACY=no 89 | PREFIX=24 90 | 91 | ## 下边的几项配置是修改的 92 | #UUID=f22334e3-05d1-450e-a50a-1da9f5f27915 93 | ONBOOT=yes 94 | BOOTPROTO=static 95 | IPADDR=192.168.218.110 # 当前机器的ip 96 | GATEWAY=192.168.218.2 # 路由 要求网段一直 218 数字与ip的一样 97 | DNS1=192.168.218.2 # 同上 98 | ``` 99 | 100 | 配置主机名编辑文件`/etc/hostname`,添加下列 101 | 102 | ```shell 103 | node01 104 | ``` 105 | 106 | 配置完成后,重启网络服务。 107 | 108 | ```shell 109 | service network restart 110 | ``` 111 | 112 | ### 同步服务器时间 113 | 114 | ```shell 115 | # 安装ntpdate 116 | yum -y install ntpdate 117 | 118 | 119 | # 安装完成执行命令 120 | crontab -e 121 | 122 | 123 | # 此时进入文本编辑模式 使用 i 插入下列命令 124 | */1 * * * * /usr/sbin/ntpdate time1.aliyun.com 125 | # 填写完成后,输入 :wq 保存退出 126 | ``` 127 | 128 | ### 添加用户 129 | 130 | 按步骤执行以下命令 131 | 132 | ```shell 133 | # 添加用户组 134 | groupadd hadoop 135 | 136 | 137 | # 创建用户并添加到hadoop组中 138 | useradd -g hadoop hadoop 139 | 140 | 141 | # 使用id命令查看hadoop用户组和hadoop用户创建是否成功 142 | id hadoop 143 | # 正常输出 uid=1000(hadoop) gid=1000(hadoop) groups=1000(hadoop) 144 | 145 | 146 | # 设置hadoop用户密码为hadoop 147 | passwd hadoop 148 | 149 | ``` 150 | 151 | ### 切换到hadoop用户!! 152 | ### 切换到hadoop用户!! 153 | ### 切换到hadoop用户!! 154 | 155 | **谨记:从这里开始未声明使用root用户,默认都是用hadoop用户操作!!!** 156 | **谨记:从这里开始未声明使用root用户,默认都是用hadoop用户操作!!!** 157 | **谨记:从这里开始未声明使用root用户,默认都是用hadoop用户操作!!!** 158 | 159 | ```shell 160 | su - hadoop 161 | ``` 162 | 163 | ### 创建应用安装包以及数据存储目录 164 | 165 | ```java 166 | mkdir -p /hadoop/soft # 软件压缩包存放目录 167 | mkdir -p /hadoop/install # 软件解压后存放目录 168 | mkdir -p /hadoop/datadir # 各应用的数据存放目录 169 | chown -R hadoop:hadoop /hadoop # 将文件夹权限更改为hadoop用户 170 | ``` 171 | 172 | ### 上传安装包以及解压 173 | 174 | #### 上传 175 | 176 | 根据下载链接将需要组件下载到宿主机,由宿主机上传到虚拟机中 177 | 178 | **注意**:这里上传时要使用`hadoop`用户,不然还需要更改文件所属用户!! 179 | 180 | 上传路径为 `/hadoop/soft`~ 181 | 182 | 至于用什么方式,sftp、scp或其他工具都可! 183 | 184 | #### 解压 185 | 186 | 使用`hadoop`用户登录,解压命令直接解压即可 187 | 188 | **注意**:一定要用`hadoop`用户!!! 
189 | 190 | ```shell 191 | tar -xzvf hadoop-2.6.0-cdh5.14.2.tar.gz -C /hadoop/install/ 192 | ``` 193 | 194 | ### 配置jdk 195 | 196 | jdk可选择配置全局,也可以选择配置只针对`hadoop`用户。 197 | 198 | 这里我选择配置只针对`hadoop`用户~ 199 | 200 | 命令 `vim ~/.bash_profile` 201 | 202 | ```shell 203 | export JAVA_HOME=/hadoop/install/jdk1.8.0_141 204 | 205 | PATH=$PATH:$HOME/bin:$JAVA_HOME/bin 206 | ``` 207 | 208 | 修改完成使用命令 `source ~/.bash_profile`,更新用户环境变量。 209 | 210 | **验证环境**: 211 | 212 | ```shell 213 | java -verison 214 | 215 | # 正常输出。jdk版本 216 | # 错误输出 找不到命令 217 | ``` 218 | 219 | ## 配置zookeeper 220 | 221 | zookeeper的配置较为简单,只需要添加两个文件即可 222 | 223 | 第一个文件 zoo.cfg,命令 `vim zoo.cfg` 224 | 225 | ```shell 226 | tickTime=2000 227 | initLimit=10 228 | syncLimit=5 229 | clientPort=2181 230 | 231 | # 路径需要根据你的真实情况进行修改 232 | dataDir=/hadoop/datadir/zookeeper/ 233 | # 只修改你的主机hostname就可以,我这里三台机器命名为,`node01`、`node02`、`node03` 234 | server.1=node01:2888:3888 235 | server.2=node02:2888:3888 236 | server.3=node03:2888:3888 237 | ``` 238 | 239 | 第二个文件 myid,进入第一个配置文件中`dataDir`配置的目录,命令 `vim myid`,添加 `1`,即可(**这块每台机器不一样,在我们克隆虚拟机镜像后 240 | 需要手动将其修改!!稍后介绍**)。 241 | 242 | ## 配置hadoop 243 | 244 | **hadoop的配置文件不需要区分节点**,也就是说每个几点的配置文件都是相同的,所以我们在克隆虚拟机镜像前先将其配置好, 245 | 这样在克隆镜像后尽量最小的配置文件改动! 246 | 247 | ### 配置环境变量(参考jdk配置) 248 | 249 | ```shell 250 | export HADOOP_HOME=/hadoop/install/hadoop-2.6.0-cdh5.14.2 251 | 252 | PATH=$PATH:$HOME/bin:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin 253 | ``` 254 | 255 | 修改完成使用命令 `source ~/.bash_profile`,更新用户环境变量。 256 | 257 | ### 配置xml 258 | 259 | 需要修改的xml一共四个,都存放在`/hadoop/install/hadoop-2.6.0-cdh5.14.2/etc/hadoop/`目录下,`core-site.xml`、`hdfs-site.xml`、`yarn-site.xml`、`mapred-site.xml`。 260 | 261 | #### core-site.xml 262 | 263 | [获取core-site.xml](https://github.com/sev7e0/bigdata-practice/blob/master/src/main/java/com/tools/hadoop/config/core-site.xml) 264 | 265 | #### hdfs-site.xml 266 | 267 | [获取hdfs-site.xml](https://github.com/sev7e0/bigdata-practice/blob/master/src/main/java/com/tools/hadoop/config/hdfs-site.xml) 268 | 269 | #### yarn-site.xml 270 | 271 | [获取yarn-site.xml](https://github.com/sev7e0/bigdata-practice/blob/master/src/main/java/com/tools/hadoop/config/yarn-site.xml) 272 | 273 | #### mapred-site.xml 274 | 275 | [获取mapred-site.xml](https://github.com/sev7e0/bigdata-practice/blob/master/src/main/java/com/tools/hadoop/config/mapred-site.xml) 276 | 277 | ### 更改slaves 278 | 279 | `slaves`文件同样存在`/hadoop/install/hadoop-2.6.0-cdh5.14.2/etc/hadoop/`目录中, 280 | 281 | ```shell 282 | # vi slaves 283 | #将localhost这一行删除掉,添加下边三个节点 284 | node01 285 | node02 286 | node03 287 | ``` 288 | 289 | ### 手动创建hadoop所需数据目录 290 | 291 | 下边给出命令,直接整体复制执行即可,若你修改了路径,则需要对应的修改。 292 | 293 | ```shell 294 | mkdir -p /hadoop/datadir/hadoop/tempDatas 295 | mkdir -p /hadoop/datadir/hadoop/namenodeDatas 296 | mkdir -p /hadoop/datadir/hadoop/datanodeDatas 297 | mkdir -p /hadoop/datadir/hadoop/dfs/nn/edits 298 | mkdir -p /hadoop/datadir/hadoop/dfs/snn/name 299 | mkdir -p /hadoop/datadir/hadoop/dfs/nn/snn/edits 300 | mkdir -p /hadoop/datadir/hadoop/yarn/local 301 | mkdir -p /hadoop/datadir/hadoop/journal 302 | ``` 303 | 304 | **注意**:有的同学这块没有注意到上边提到的`hadoop`用户的读写权限,或者用了root创建,导致目录无法写入的异常。该路径一定要属于`hadoop`用户!!! 305 | 306 | ## 复制虚拟机镜像 307 | 308 | 这一步直接关机完整克隆就好了 309 | 310 | **注意**:有的同学复制镜像的同时把虚拟机的mac地址也复制了,这样将会导致其他两台启动后无法使用,若mac地址相同, 311 | 那么重新生成一个mac地址。 312 | 313 | ## 更改其他两台hostname、ip 314 | 315 | 参考[配置网卡及主机名](#jump) 316 | 317 | ## 启动每一个节点虚拟机 318 | 319 | 启动每一台虚拟机! 
320 | 321 | ## 配置免密登录 322 | 323 | Linux免密登录,本质上是使用了`公钥登录`。原理很简单,就是用户将自己的`公钥`储存在远程主机上。登录的时候,远程主机会向用户发送一段随机字符串,用户用自己的`私钥`加密后,再发回来。远程主机用事先储存的`公钥`进行解密,如果成功,就证明用户是可信的,直接允许登录shell,不再要求密码。 324 | 325 | **注意**:免密登录是针对每一个不同用户的,所以我们一定要在`hadoop`用户下执行。以下命令要在每一台机器上都执行~~~ 326 | 327 | ```shell 328 | ## 生成密钥 329 | ## 期间需要输入几次回车,直接回车即可 330 | ssh-keygen -t rsa 331 | 332 | ## 发送自己的公钥到每一台机器上,包括自己本身 333 | ## 由于每条命令都需要输入对方的密码,所以要一条一条的执行!!! 334 | ssh-copy-id -i ~/.ssh/id_rsa.pub node01 335 | ssh-copy-id -i ~/.ssh/id_rsa.pub node02 336 | ssh-copy-id -i ~/.ssh/id_rsa.pub node03 337 | ``` 338 | 339 | **注意**:一定要验证是否成功,在每台机器上相互`ssh`不需要密码就能登录,那么就说明免密登录配置成功!!! 340 | 341 | ```shell 342 | ssh node01 343 | ``` 344 | 345 | ## zookeeper启动 346 | 347 | ### 手动启动每一台节点 348 | 349 | **注意**:在启动前,我们要把刚刚的zookeeper配置中myid更改一下,才可以启动!!! 350 | 351 | 不同的机器对应不同的myid,从下边配置中获取,node01对应1,以此类推。 352 | 353 | ```shell 354 | server.1=node01:2888:3888 355 | server.2=node02:2888:3888 356 | server.3=node03:2888:3888 357 | ``` 358 | 359 | ```shell 360 | # 启动zk 361 | # 在每一台机器上执行 362 | /hadoop/install/zookeeper-3.4.5-cdh5.14.2/bin/zkServer.sh start 363 | # 检查状态 364 | /hadoop/install/zookeeper-3.4.5-cdh5.14.2/bin/zkServer.sh status 365 | ``` 366 | 367 | ### 脚本启动所有节点 368 | 369 | ```shell 370 | #!/bin/bash --login 371 | 372 | zookeeper=$1 373 | path=$2 374 | command=$3 375 | 376 | A=start status stop 377 | 378 | start(){ 379 | echo "$1 zookeeper on $2" 380 | ssh -l hadoop $2 "$3 $1" 381 | } 382 | 383 | if [ "$zookeeper" == "" ] || [ "$command" == "" ];then 384 | echo "usage:'node01 node02 node03' ./zkServer.sh [start status stop]" 385 | exit 0 386 | fi 387 | 388 | # 判断是否为支持的命令 389 | for c in $A 390 | do 391 | if [ "$command" != "$c" ];then 392 | echo "当前只支持:[start status stop]命令" 393 | exit 0 394 | fi 395 | done 396 | 397 | if [ "$command" != "" ];then 398 | for zk in $zookeeper 399 | do 400 | start $command $zk $path 401 | done 402 | else 403 | echo "请输入正确命令" 404 | echo "'node01 node02 node03' ./zkServer.sh [start status stop]" 405 | fi 406 | ``` 407 | 408 | 启动只需要在主节点执行脚本即可!!! 409 | 410 | ```shell 411 | ./zkcluster_run.sh 'node01 node02 node03' /hadoop/install/zookeeper-3.4.5-cdh5.14.2/bin/zkServer.sh start 412 | ``` 413 | 414 | ## hadoop格式化并启动 415 | 416 | ### 格式化namenode 417 | 418 | 初始化的目的就是为了hdfs的元数据信息的初始化。 419 | 420 | **注意:** NameNode格式化只能在node01执行一次,不然会导致集群启动失败,!!!! 421 | 422 | 命令 423 | 424 | ```shell 425 | hdfs namenode -format 426 | ``` 427 | 428 | 成功的标志: 429 | 430 | ```log 431 | 19/08/23 04:32:34 INFO namenode.NameNode: STARTUP_MSG: 432 | /************************************************************ 433 | STARTUP_MSG: Starting NameNode 434 | STARTUP_MSG: user = hadoop 435 | STARTUP_MSG: host = ...... 436 | STARTUP_MSG: args = [-format] 437 | STARTUP_MSG: version = 2.6.0-cdh5.14.2 438 | #显示格式化成功。。。 439 | cdh5.14.2/hadoopDatas/namenodeDatas has been successfully formatted. 440 | 19/08/23 04:32:35 INFO common.Storage: Storage directory /hadoop/install/hadoop-2.6.0-cdh5.14.2/hadoopDatas/dfs/nn/edits has been successfully formatted. 441 | 19/08/23 04:32:35 INFO namenode.FSImageFormatProtobuf: Saving image file /hadoop/install/hadoop-2.6.0-cdh5.14.2/hadoopDatas/namenodeDatas/current/fsimage.ckpt_0000000000000000000 using no compression 442 | 19/08/23 04:32:35 INFO namenode.FSImageFormatProtobuf: Image file /hadoop/install/hadoop-2.6.0-cdh5.14.2/hadoopDatas/namenodeDatas/current/fsimage.ckpt_0000000000000000000 of size 323 bytes saved in 0 seconds. 
443 | 19/08/23 04:32:35 INFO namenode.NNStorageRetentionManager: Going to retain 1 images with txid >= 0 444 | 19/08/23 04:32:35 INFO util.ExitUtil: Exiting with status 0 445 | 19/08/23 04:32:35 INFO namenode.NameNode: SHUTDOWN_MSG: 446 | #此处省略部分日志 447 | /************************************************************ 448 | SHUTDOWN_MSG: Shutting down NameNode at ..... 449 | ************************************************************/ 450 | ``` 451 | 452 | ### 同步namenode 453 | 454 | 在master的NameNode启动之后,我们进行对NameNode的数据同步 455 | 在standby-master(也就是我们node02)输入以下命令,输出的日志和上边的相仿 456 | 457 | ```shell 458 | hdfs namenode -bootstrapStandby 459 | ``` 460 | 461 | 如上步骤都顺利的话接下来就可以启动集群了! 462 | 463 | ### 启动集群 464 | 465 | 两种方式~ 466 | 467 | ```shell 468 | start-all.sh 469 | # 不过这种方式官方已经不在推荐了 470 | ``` 471 | 472 | 可以使用如下启动 473 | 474 | ```shell 475 | # 启动hdfs 476 | start-dfs.sh 477 | # 启动yarn 478 | start-yarn.sh 479 | ``` 480 | 481 | ### 查看进程 482 | 483 | node01大概长这样,其他两台节点参考[集群规划](#Plan),部署了的那么一定存在进程,若不存在进程,那么需要查看日志解决问题~ 484 | 485 | ```shell 486 | 12707 DFSZKFailoverController 487 | 12820 ResourceManager 488 | 12327 DataNode 489 | 12521 JournalNode 490 | 12220 NameNode 491 | 12941 NodeManager 492 | 1578 QuorumPeerMain # zookeeper进程,其余全都是hadoop进程 493 | ``` 494 | 495 | ### 查看webUI 496 | 497 | 启动完成后可以通过webUI查看集群的信息,打开下边链接即可查看!! 498 | 499 | 两个节点都可以查看,要确保一个为active,另一个为standby的状态! 500 | 501 | [node01:50070](http://node01:50070) 502 | [node02:50070](http://node02:50070) 503 | 504 | ## 常见问题 505 | 506 | ### 不小心多次格式化namenode 507 | 508 | 若不小心在每台机器上都执行了`hdfs namenode -format`,此时每台节点的集群id将会不一致会导致其他机器无法加入集群! 509 | 510 | **解决办法:**清空[创建的每一个数据目录](#mkdir)!重新执行`hdfs namenode -format`即可!切记!!!只在node01上执行,执行完后要在node02[同步](#bootstrap) 511 | 512 | 513 | ### yarn启动异常 514 | 515 | ```log 516 | 2019-09-30 18:15:49,231 FATAL org.apache.hadoop.yarn.server.resourcemanager.ResourceManager: Error starting ResourceManager 517 | org.apache.hadoop.HadoopIllegalArgumentException: Configuration doesn't specify yarn.resourcemanager.cluster-id 518 | at org.apache.hadoop.yarn.conf.YarnConfiguration.getClusterId(YarnConfiguration.java:1785) 519 | at org.apache.hadoop.yarn.server.resourcemanager.EmbeddedElectorService.serviceInit(EmbeddedElectorService.java:82) 520 | at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163) 521 | at org.apache.hadoop.service.CompositeService.serviceInit(CompositeService.java:107) 522 | at org.apache.hadoop.yarn.server.resourcemanager.AdminService.serviceInit(AdminService.java:145) 523 | at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163) 524 | at org.apache.hadoop.service.CompositeService.serviceInit(CompositeService.java:107) 525 | at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.serviceInit(ResourceManager.java:276) 526 | at org.apache.hadoop.service.AbstractService.init(AbstractService.java:163) 527 | at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.main(ResourceManager.java:1309) 528 | ``` 529 | 530 | **解决办法:**在yarn-site.xml中配置一个id,如下。 531 | 532 | ```xml 533 | 534 | yarn.resourcemanager.cluster-id 535 | cluster1 536 | 537 | ``` 538 | 539 | ### ZKFailoverController启动失败问题 540 | 541 | #### 异常一 542 | 543 | ```log 544 | 2019-09-30 18:15:45,010 FATAL org.apache.hadoop.hdfs.tools.DFSZKFailoverController: Got a fatal error, exiting now 545 | java.lang.IllegalArgumentException: Missing required configuration 'ha.zookeeper.quorum' for ZooKeeper quorum 546 | at com.google.common.base.Preconditions.checkArgument(Preconditions.java:115) 547 | at 
org.apache.hadoop.ha.ZKFailoverController.initZK(ZKFailoverController.java:340) 548 | at org.apache.hadoop.ha.ZKFailoverController.doRun(ZKFailoverController.java:190) 549 | at org.apache.hadoop.ha.ZKFailoverController.access$000(ZKFailoverController.java:60) 550 | at org.apache.hadoop.ha.ZKFailoverController$1.run(ZKFailoverController.java:171) 551 | at org.apache.hadoop.ha.ZKFailoverController$1.run(ZKFailoverController.java:167) 552 | at org.apache.hadoop.security.SecurityUtil.doAsLoginUserOrFatal(SecurityUtil.java:444) 553 | at org.apache.hadoop.ha.ZKFailoverController.run(ZKFailoverController.java:167) 554 | at org.apache.hadoop.hdfs.tools.DFSZKFailoverController.main(DFSZKFailoverController.java:192) 555 | ``` 556 | 557 | **解决办法:** 558 | 559 | - 确认是否配置了 560 | 561 | ```xml 562 | 563 | 564 | ha.zookeeper.quorum 565 | node01:2181,node02:2181,node03:2181 566 | 567 | ``` 568 | 569 | - 检查服务器时间是否同步 570 | 571 | [如何同步服务器时间](#ntpdate) 572 | 573 | **注意:** 同步需要在root用户下。 574 | 575 | #### 异常二 576 | 577 | ```log 578 | 2019-09-30 15:42:05,418 FATAL org.apache.hadoop.ha.ZKFailoverController: Unable to start failover controller. Parent znode does not exist. 579 | Run with -formatZK flag to initialize ZooKeeper. 580 | ``` 581 | 582 | **解决办法:** 此刻以为这你的hadoop节点还没有注册到zookeeper中,需要初始化。 583 | 584 | ```shell 585 | # 执行命令进行初始化 586 | hdfs zkfc -formatZK 587 | ``` 588 | 589 | 重新起动集群即可。 -------------------------------------------------------------------------------- /src/main/java/com/tools/hbase/HBaseFilter.java: -------------------------------------------------------------------------------- 1 | package com.tools.hbase; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.hbase.CellUtil; 6 | import org.apache.hadoop.hbase.TableName; 7 | import org.apache.hadoop.hbase.client.Connection; 8 | import org.apache.hadoop.hbase.client.ConnectionFactory; 9 | import org.apache.hadoop.hbase.client.Scan; 10 | import org.apache.hadoop.hbase.client.Table; 11 | import org.apache.hadoop.hbase.filter.Filter; 12 | import org.apache.hadoop.hbase.filter.PrefixFilter; 13 | import org.apache.hadoop.hbase.filter.RandomRowFilter; 14 | import org.apache.hadoop.hbase.util.Bytes; 15 | 16 | import java.io.IOException; 17 | 18 | @Slf4j 19 | public class HBaseFilter { 20 | 21 | private static Connection connection; 22 | static { 23 | Configuration configuration = new Configuration(); 24 | configuration.set("hbase.zookeeper.quorum", "spark01"); 25 | configuration.set("hbase.zookeeper.property.clientPort", "2181"); 26 | try { 27 | connection = ConnectionFactory.createConnection(configuration); 28 | } catch (IOException e) { 29 | log.error(e.getMessage()); 30 | } 31 | } 32 | 33 | public static void main(String[] args) throws IOException { 34 | Table table = connection.getTable(TableName.valueOf(HBaseTestUtil.getTableName("table_20191108"))); 35 | 36 | // rowId前缀过滤 37 | log.warn("PrefixFilter"); 38 | Filter pf = new PrefixFilter(Bytes.toBytes("1")); 39 | Scan scan00 = new Scan().setFilter(pf); 40 | table.getScanner(scan00).forEach(res-> log.info(Bytes.toString(res.getValue(HBaseTestUtil.getFamilyName(null), "data_stamp".getBytes())))); 41 | 42 | //随机百分比过滤 43 | log.warn("RandomRowFilter"); 44 | Filter randomRowFilter = new RandomRowFilter(0.003f); 45 | Scan scan01 = new Scan().setFilter(randomRowFilter); 46 | table.getScanner(scan01).forEach(res-> res.listCells().forEach(cell -> log.info("{} : {} : {} : {} : {}", 47 | Bytes.toString(CellUtil.cloneRow(cell)), 48 | 
Bytes.toString(CellUtil.cloneFamily(cell)), 49 | Bytes.toString(CellUtil.cloneQualifier(cell)), 50 | Bytes.toString(CellUtil.cloneValue(cell)), 51 | cell.getTimestamp()))); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hbase/HBaseReadWrite.java: -------------------------------------------------------------------------------- 1 | package com.tools.hbase; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.hbase.HBaseConfiguration; 6 | import org.apache.hadoop.hbase.HColumnDescriptor; 7 | import org.apache.hadoop.hbase.HTableDescriptor; 8 | import org.apache.hadoop.hbase.TableName; 9 | import org.apache.hadoop.hbase.client.*; 10 | import org.apache.hadoop.hbase.util.Bytes; 11 | 12 | import java.io.IOException; 13 | 14 | @Slf4j 15 | public class HBaseReadWrite { 16 | 17 | private Connection connection = null; 18 | 19 | public static void main(String[] args) throws IOException { 20 | HBaseReadWrite readWrite = new HBaseReadWrite(); 21 | readWrite.init(); 22 | readWrite.creatTable(); 23 | for (int i = 0; i < 10000; i++) { 24 | long stamp = System.currentTimeMillis(); 25 | String data = "data_"+stamp; 26 | System.out.println("insert data :"+data+""); 27 | readWrite.insert(HBaseTestUtil.getTableName(null), String.valueOf(stamp), HBaseTestUtil.getFamilyName(null), "data_stamp".getBytes(), data); 28 | } 29 | 30 | readWrite.scan(HBaseTestUtil.getTableName(null), HBaseTestUtil.getFamilyName(null), "data_stamp"); 31 | } 32 | 33 | /** 34 | * 初始化连接 35 | * 36 | * @throws IOException 37 | */ 38 | private void init() throws IOException { 39 | Configuration configuration = HBaseConfiguration.create(); 40 | 41 | connection = ConnectionFactory.createConnection(configuration); 42 | } 43 | 44 | /** 45 | * 创建表 46 | * 47 | * @throws IOException 48 | */ 49 | private void creatTable() throws IOException { 50 | Admin admin = connection.getAdmin(); 51 | TableName tableName = TableName.valueOf(HBaseTestUtil.getTableName(null)); 52 | if (admin.tableExists(tableName)) { 53 | // hbase 在删除表之前要先 disable 54 | admin.disableTable(tableName); 55 | admin.deleteTable(tableName); 56 | } 57 | HTableDescriptor descriptor = new HTableDescriptor(tableName); 58 | descriptor.addFamily(new HColumnDescriptor(HBaseTestUtil.getFamilyName(null))); 59 | admin.createTable(descriptor); 60 | } 61 | 62 | /** 63 | * 插入数据 64 | * 65 | * @param tableN 表名 66 | * @param rowId row id 67 | * @param familyName 列族 68 | * @param qualifier 列 69 | * @param value 数据 70 | * @throws IOException 71 | */ 72 | private void insert(byte[] tableN, String rowId, byte[] familyName, byte[] qualifier, String value) throws IOException { 73 | TableName tableName = TableName.valueOf(tableN); 74 | Table table = connection.getTable(tableName); 75 | Put put = new Put(rowId.getBytes()); 76 | put.addColumn(familyName, qualifier, value.getBytes()); 77 | table.put(put); 78 | } 79 | 80 | /** 81 | * @param tableN 表名 82 | * @param familyN 列族 83 | * @param qualifier 列 84 | * @throws IOException 85 | */ 86 | private void scan(byte[] tableN, byte[] familyN, String qualifier) throws IOException { 87 | TableName tableName = TableName.valueOf(tableN); 88 | Table table = connection.getTable(tableName); 89 | Scan scan = new Scan(); 90 | ResultScanner scanner = table.getScanner(scan); 91 | scanner.forEach(data -> System.out.println((Bytes.toString(data.getValue(familyN, qualifier.getBytes()))))); 92 | } 93 | } 94 | 
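The write path above issues 10,000 individual `Put` RPCs, one per row. For bulk test data a `BufferedMutator` usually cuts the round trips noticeably, because mutations are buffered client-side and flushed in batches. A minimal sketch, assuming it is added as a method of `HBaseReadWrite` (it reuses the class's `connection` field, expects the table and family created by `creatTable()`, and needs an extra import of `org.apache.hadoop.hbase.client.BufferedMutator`):

```java
/**
 * Batched-insert sketch (assumption: lives inside HBaseReadWrite and the
 * target table/column family already exist, e.g. created by creatTable()).
 */
private void insertBatch(byte[] tableN, byte[] familyName, byte[] qualifier, int rows) throws IOException {
    try (BufferedMutator mutator = connection.getBufferedMutator(TableName.valueOf(tableN))) {
        for (int i = 0; i < rows; i++) {
            long stamp = System.currentTimeMillis();
            Put put = new Put(Bytes.toBytes(stamp + "_" + i));
            put.addColumn(familyName, qualifier, Bytes.toBytes("data_" + stamp));
            // mutate() only buffers the Put; the client flushes automatically when the buffer fills
            mutator.mutate(put);
        }
        // push whatever is still buffered before the mutator is closed
        mutator.flush();
    }
}
```

The same effect can be had with `table.put(List<Put>)`; `BufferedMutator` is simply the more convenient choice when the total batch size is not known up front.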
-------------------------------------------------------------------------------- /src/main/java/com/tools/hbase/HBaseTestUtil.java: -------------------------------------------------------------------------------- 1 | package com.tools.hbase; 2 | 3 | import org.apache.hadoop.hbase.util.Bytes; 4 | 5 | import java.time.LocalDate; 6 | import java.time.format.DateTimeFormatter; 7 | import java.util.Objects; 8 | 9 | public class HBaseTestUtil { 10 | 11 | public static void main(String[] args) { 12 | System.out.println(Bytes.toString(getTableName(null))); 13 | } 14 | 15 | public static byte[] getTableName(String name) { 16 | String format = LocalDate.now().format(DateTimeFormatter.BASIC_ISO_DATE); 17 | return Objects.isNull(name) ? ("table_" + format).getBytes() : name.getBytes(); 18 | } 19 | 20 | public static byte[] getFamilyName(String name) { 21 | String format = LocalDate.now().format(DateTimeFormatter.BASIC_ISO_DATE); 22 | return Objects.isNull(name) ? ("family_" + format).getBytes() : name.getBytes(); 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hbase/HBase读写的几种方式.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sev7e0/bigdata-practice/ffbdd93bd555fd388d4dd20ccc3379124a3eae5f/src/main/java/com/tools/hbase/HBase读写的几种方式.pdf -------------------------------------------------------------------------------- /src/main/java/com/tools/hbase/Utils.java: -------------------------------------------------------------------------------- 1 | package com.tools.hbase; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.hbase.HBaseConfiguration; 7 | import org.apache.hadoop.hbase.TableName; 8 | import org.apache.hadoop.hbase.client.Connection; 9 | import org.apache.hadoop.hbase.client.ConnectionFactory; 10 | import org.apache.hadoop.hbase.client.RegionLocator; 11 | import org.apache.hadoop.hbase.client.Table; 12 | import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles; 13 | 14 | /** 15 | * HBase Utils Class 16 | */ 17 | public class Utils { 18 | 19 | /** 20 | * HBase Bulk Load method replace command line 21 | * 22 | * @param configuration hadoop config 23 | * @param hFilePath HFile path 24 | * @param table table name 25 | */ 26 | public static void doBulkLoad(Configuration configuration, String hFilePath, String table) { 27 | try { 28 | FileSystem fileSystem = FileSystem.newInstance(configuration); 29 | // add HBase config to Configuration object 30 | HBaseConfiguration.addHbaseResources(configuration); 31 | LoadIncrementalHFiles loadIncrementalHFiles = new LoadIncrementalHFiles(configuration); 32 | 33 | // create HBase connection 34 | try (Connection connection = ConnectionFactory.createConnection(configuration)) { 35 | Table connectionTable = connection.getTable(TableName.valueOf(table)); 36 | RegionLocator regionLocator = connection.getRegionLocator(connectionTable.getName()); 37 | // new client api for HBase 1.0.0+ 38 | loadIncrementalHFiles.doBulkLoad(new Path(hFilePath), connection.getAdmin(), connectionTable, regionLocator); 39 | } 40 | } catch (Exception e) { 41 | e.printStackTrace(); 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hbase/hdfs2hbase/HDFS2HBase.java: -------------------------------------------------------------------------------- 1 
| package com.tools.hbase.hdfs2hbase; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.hbase.client.Put; 6 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable; 7 | import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil; 8 | import org.apache.hadoop.hbase.mapreduce.TableReducer; 9 | import org.apache.hadoop.hbase.util.Bytes; 10 | import org.apache.hadoop.io.LongWritable; 11 | import org.apache.hadoop.io.NullWritable; 12 | import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.mapreduce.Job; 14 | import org.apache.hadoop.mapreduce.Mapper; 15 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 16 | import java.io.IOException; 17 | 18 | public class HDFS2HBase { 19 | 20 | public static class HBaseMap extends Mapper{ 21 | @Override 22 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 23 | context.write(value,NullWritable.get()); 24 | } 25 | } 26 | 27 | public static class HBaseReduce extends TableReducer{ 28 | @Override 29 | protected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 30 | String[] data = key.toString().split("\t"); 31 | Put put = new Put(data[0].getBytes()); 32 | put.addColumn("info".getBytes(),"name".getBytes(),data[1].getBytes()); 33 | put.addColumn("info".getBytes(),"sex".getBytes(),data[2].getBytes()); 34 | put.addColumn("course".getBytes(),"match".getBytes(),data[3].getBytes()); 35 | put.addColumn("course".getBytes(),"chinese".getBytes(),data[4].getBytes()); 36 | context.write(new ImmutableBytesWritable(Bytes.toBytes(data[0])),put); 37 | } 38 | } 39 | 40 | 41 | public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 42 | Configuration configuration = new Configuration(); 43 | Job job = Job.getInstance(configuration); 44 | job.setJarByClass(HDFS2HBase.class); 45 | job.setInputFormatClass(TextInputFormat.class); 46 | TextInputFormat.addInputPath(job, new Path(args[0])); 47 | job.setMapperClass(HBaseMap.class); 48 | job.setMapOutputKeyClass(Text.class); 49 | job.setMapOutputValueClass(NullWritable.class); 50 | 51 | /** 52 | * reduce任务交由Hbase完善 53 | */ 54 | TableMapReduceUtil.initTableReducerJob(args[1],HBaseReduce.class,job); 55 | job.setNumReduceTasks(1); 56 | job.waitForCompletion(false); 57 | 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hbase/hdfs2hbase/scores.txt: -------------------------------------------------------------------------------- 1 | 1001 lx m 97 93 2 | 1002 xm f 92 84 3 | 1003 bruce m 90 83 4 | 1100 tony m 95 76 5 | 1110 dl f 87 96 6 | 1005 xl m 85 78 7 | 1006 eh f 82 99 8 | 1009 tg f 75 95 9 | 2000 xingxing m 77 63 10 | 1007 xw m 99 65 11 | 1301 qy f 95 73 12 | 1303 qo f 88 79 13 | 1307 sn m 76 82 14 | 1402 sw f 92 91 15 | 1404 sp f 91 83 16 | 1408 wd f 83 94 17 | 1513 wf m 75 86 18 | 1515 xl f 77 85 19 | 1519 kl f 96 77 20 | 1520 zs m 82 79 21 | 1621 ls m 90 96 22 | 1624 ll m 61 94 23 | 1627 cm f 52 85 24 | 1629 cr f 79 81 25 | 1730 cz m 95 87 26 | 1733 hk f 93 56 27 | 1734 rk m 86 88 28 | 1739 zy m 84 99 29 | 1885 zf f 72 71 30 | 1887 gy f 71 86 -------------------------------------------------------------------------------- /src/main/java/com/tools/hbase/processor/HBasePerson.java: -------------------------------------------------------------------------------- 1 | package com.tools.hbase.processor; 2 | 3 | import 
org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.hbase.HBaseConfiguration; 6 | import org.apache.hadoop.hbase.HColumnDescriptor; 7 | import org.apache.hadoop.hbase.HTableDescriptor; 8 | import org.apache.hadoop.hbase.TableName; 9 | import org.apache.hadoop.hbase.client.Connection; 10 | import org.apache.hadoop.hbase.client.ConnectionFactory; 11 | import org.apache.hadoop.hbase.client.Put; 12 | import org.apache.hadoop.hbase.client.Table; 13 | 14 | import java.io.IOException; 15 | 16 | public class HBasePerson { 17 | 18 | public static void main(String[] args) { 19 | 20 | Configuration configuration = HBaseConfiguration.create(); 21 | configuration.set("hbase.zookeeper.quorum", "spark01:2181,spark02:2181,spark03:2181"); 22 | configuration.set("hbase.table.sanity.checks", "false"); 23 | 24 | try (Connection connection = ConnectionFactory.createConnection(configuration)) { 25 | TableName tableName = TableName.valueOf("person"); 26 | if (!connection.getAdmin().tableExists(tableName)) { 27 | HTableDescriptor hTableDescriptor = new HTableDescriptor(tableName); 28 | HColumnDescriptor info = new HColumnDescriptor("info"); 29 | hTableDescriptor.addFamily(info); 30 | Path path = new Path("hdfs://spark01:9000/bigdata-practice-0.jar"); 31 | hTableDescriptor.addCoprocessor(HBaseProcessor.class.getCanonicalName(), path, HBaseProcessor.PRIORITY_USER, null); 32 | connection.getAdmin().createTable(hTableDescriptor); 33 | } 34 | Put put = new Put("0002".getBytes()); 35 | put.addColumn("info".getBytes(), "name".getBytes(), "lishengnan".getBytes()); 36 | put.addColumn("info".getBytes(), "age".getBytes(), "18".getBytes()); 37 | try (Table person = connection.getTable(tableName)) { 38 | person.put(put); 39 | } 40 | } catch (IOException e) { 41 | e.printStackTrace(); 42 | } 43 | 44 | 45 | } 46 | 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hbase/processor/HBaseProcessor.java: -------------------------------------------------------------------------------- 1 | package com.tools.hbase.processor; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.hbase.*; 5 | import org.apache.hadoop.hbase.client.*; 6 | import org.apache.hadoop.hbase.coprocessor.BaseRegionObserver; 7 | import org.apache.hadoop.hbase.coprocessor.ObserverContext; 8 | import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment; 9 | import org.apache.hadoop.hbase.regionserver.wal.WALEdit; 10 | 11 | import java.io.IOException; 12 | import java.util.List; 13 | 14 | /** 15 | * HBase协处理器实践 16 | * HBase提供四种协处理器: 17 | * 18 | * RegionObserver 针对get put delete等操作的 19 | * RegionServerObserver 针对RegionServer的 20 | * WALObserver 针对日志的如滚动、删除 21 | * MasterObserver 针对表结构的创建、修改、删除 22 | * 23 | * EndpointObserver 作用是将用户层的逻辑下推到数据层执行,将大量处理结果放在HBase中执行,需要手动调用 24 | * 25 | */ 26 | public class HBaseProcessor extends BaseRegionObserver { 27 | 28 | @Override 29 | public void prePut(ObserverContext e, Put put, WALEdit edit, Durability durability) throws IOException { 30 | Configuration configuration = HBaseConfiguration.create(); 31 | configuration.set("hbase.zookeeper.quorum", "spark01:2181,spark02:2181,spark03:2181"); 32 | try (Connection connection = ConnectionFactory.createConnection(configuration)) { 33 | TableName person_back = TableName.valueOf("person_back"); 34 | //表不存在时创建表 35 | if (!connection.getAdmin().tableExists(person_back)){ 36 | HColumnDescriptor columnDescriptor = new 
HColumnDescriptor("info"); 37 | HTableDescriptor hTableDescriptor = new HTableDescriptor(person_back); 38 | hTableDescriptor.addFamily(columnDescriptor); 39 | connection.getAdmin().createTable(hTableDescriptor); 40 | } 41 | List cells = put.get("info".getBytes(), "name".getBytes()); 42 | if (cells.isEmpty()) { 43 | return; 44 | } 45 | Cell cell = cells.get(0); 46 | Put put1 = new Put(put.getRow()); 47 | put1.add(cell); 48 | try (Table person = connection.getTable(person_back)) { 49 | person.put(put1); 50 | } 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hive/MyUDAF.java: -------------------------------------------------------------------------------- 1 | package com.tools.hive; 2 | 3 | import org.apache.hadoop.hive.ql.exec.UDAF; 4 | 5 | /** 6 | * Title: MyUDAF.java 7 | * description: TODO 8 | * 9 | * @author sev7e0 10 | * @version 1.0 11 | * @since 2020-05-06 22:35 12 | **/ 13 | 14 | public class MyUDAF extends UDAF { 15 | 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hive/MyUDF.java: -------------------------------------------------------------------------------- 1 | package com.tools.hive; 2 | 3 | 4 | import com.alibaba.fastjson.JSONObject; 5 | import org.apache.hadoop.hive.ql.exec.Description; 6 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException; 7 | import org.apache.hadoop.hive.ql.metadata.HiveException; 8 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; 9 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 10 | import org.apache.hadoop.io.Text; 11 | 12 | import java.util.Objects; 13 | 14 | public class MyUDF extends GenericUDF { 15 | 16 | @Description( 17 | name = "用户自定义函数", 18 | value = "将json转化为自定义表结构", 19 | extended = "select MyUDF(data,'movie') from json;" 20 | ) 21 | public Text evaluate(final String input, String key) { 22 | if (Objects.isNull(input)) { 23 | return null; 24 | } 25 | JSONObject jsonObject = JSONObject.parseObject(input); 26 | return new Text(String.valueOf(jsonObject.get(key))); 27 | } 28 | 29 | public static void main(String[] args) { 30 | MyUDF myUDF = new MyUDF(); 31 | Text movie = myUDF.evaluate("{'movie':'1193','rate':'5','timeStamp':'978300760','uid':'1'}", "movie"); 32 | System.out.println(movie.toString()); 33 | } 34 | 35 | @Override 36 | public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { 37 | return null; 38 | } 39 | 40 | @Override 41 | public Object evaluate(DeferredObject[] arguments) throws HiveException { 42 | return null; 43 | } 44 | 45 | @Override 46 | public String getDisplayString(String[] children) { 47 | return null; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hive/MyUDTF.java: -------------------------------------------------------------------------------- 1 | package com.tools.hive; 2 | 3 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException; 4 | import org.apache.hadoop.hive.ql.metadata.HiveException; 5 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; 6 | import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; 7 | 8 | /** 9 | * Title: MyUDTF.java 10 | * description: TODO 11 | * 12 | * @author sev7e0 13 | * @version 1.0 14 | * @since 2020-05-06 22:53 15 | **/ 16 | 17 | public class MyUDTF extends GenericUDTF { 18 | 19 | @Override 20 | public StructObjectInspector 
initialize(StructObjectInspector argOIs) throws UDFArgumentException { 21 | return super.initialize(argOIs); 22 | } 23 | 24 | @Override 25 | public void process(Object[] args) throws HiveException { 26 | 27 | } 28 | 29 | @Override 30 | public void close() throws HiveException { 31 | 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/com/tools/hive/sql/test.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sev7e0/bigdata-practice/ffbdd93bd555fd388d4dd20ccc3379124a3eae5f/src/main/java/com/tools/hive/sql/test.sql -------------------------------------------------------------------------------- /src/main/java/com/tools/kafka/CustomPartitioner.java: -------------------------------------------------------------------------------- 1 | package com.tools.kafka; 2 | 3 | import org.apache.kafka.clients.producer.Partitioner; 4 | import org.apache.kafka.common.Cluster; 5 | 6 | import java.util.Map; 7 | import java.util.Random; 8 | 9 | public class CustomPartitioner implements Partitioner { 10 | @Override 11 | public int partition(String topic, Object key, byte[] keyBytes, Object value, byte[] valueBytes, Cluster cluster) { 12 | 13 | int size = cluster.availablePartitionsForTopic(topic).size(); 14 | Random random = new Random(100); 15 | return Math.abs(String.valueOf(random.nextInt()).hashCode()%size); 16 | } 17 | 18 | @Override 19 | public void close() { 20 | 21 | } 22 | 23 | @Override 24 | public void configure(Map configs) { 25 | 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/com/tools/kafka/consumer/ConsumerCommitOffset.java: -------------------------------------------------------------------------------- 1 | package com.tools.kafka.consumer; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import org.apache.kafka.clients.consumer.ConsumerConfig; 5 | import org.apache.kafka.clients.consumer.ConsumerRecord; 6 | import org.apache.kafka.clients.consumer.ConsumerRecords; 7 | import org.apache.kafka.clients.consumer.KafkaConsumer; 8 | import org.apache.kafka.common.serialization.StringDeserializer; 9 | 10 | import java.time.Duration; 11 | import java.util.Collections; 12 | import java.util.Properties; 13 | 14 | /** 15 | * 手动控制提交offset 16 | */ 17 | @Slf4j 18 | public class ConsumerCommitOffset { 19 | public static final String brokerList = "localhost:9092"; 20 | public static final String topic = "topic-1"; 21 | //新的group,相较于ConsumerQuickStart group-1分组,现在kafka是发布订阅模型 22 | public static final String groupId = "group-2"; 23 | public static final String out = "topic={} - partition={} - offset={} - value={}"; 24 | 25 | /** 26 | * 初始化配置 27 | * 28 | * @return 29 | */ 30 | private static Properties initProperties() { 31 | Properties properties = new Properties(); 32 | 33 | properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); 34 | properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); 35 | properties.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokerList); 36 | properties.put(ConsumerConfig.GROUP_ID_CONFIG, groupId); 37 | 38 | //关闭kafka默认的自动提交offset,容易导致重复处理的问题 39 | properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false); 40 | return properties; 41 | } 42 | 43 | 44 | public static void main(String[] args) { 45 | 46 | try (KafkaConsumer consumer = new KafkaConsumer<>(initProperties())) { 47 | 
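// Auto-commit was disabled in initProperties(), so offsets are committed manually below:
// commitAsync() while records are being processed, plus one commitSync() in the
// finally block as a final safeguard so the last batch of offsets is not lost.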
consumer.subscribe(Collections.singletonList(topic)); 48 | try { 49 | while (true) { 50 | ConsumerRecords records = consumer.poll(Duration.ofMillis(1000)); 51 | // lo.info(out, 52 | // record.topic(), 53 | // record.partition(), 54 | // record.offset(), 55 | // record.value())); 56 | //异步提交offset 57 | for (ConsumerRecord record : records) { 58 | consumer.commitAsync(); 59 | } 60 | } 61 | } finally { 62 | //使用同步提交,做最后的把关 63 | consumer.commitSync(); 64 | } 65 | 66 | } 67 | } 68 | 69 | } 70 | 71 | -------------------------------------------------------------------------------- /src/main/java/com/tools/kafka/consumer/ConsumerInterceptorTTL.java: -------------------------------------------------------------------------------- 1 | package com.tools.kafka.consumer; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import org.apache.kafka.clients.consumer.ConsumerInterceptor; 5 | import org.apache.kafka.clients.consumer.ConsumerRecord; 6 | import org.apache.kafka.clients.consumer.ConsumerRecords; 7 | import org.apache.kafka.clients.consumer.OffsetAndMetadata; 8 | import org.apache.kafka.common.TopicPartition; 9 | 10 | import java.util.ArrayList; 11 | import java.util.HashMap; 12 | import java.util.List; 13 | import java.util.Map; 14 | 15 | /** 16 | * 自定义消费者拦截器 17 | *
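* onConsume keeps only the records whose timestamp is within the last 10 seconds (the TTL hard-coded below) and regroups the survivors by partition; onCommit simply prints the committed offset of each partition.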

18 | * 实现消息过期时间的功能 19 | */ 20 | @Slf4j 21 | public class ConsumerInterceptorTTL implements ConsumerInterceptor { 22 | @Override 23 | public ConsumerRecords onConsume(ConsumerRecords consumerRecords) { 24 | long timeMillis = System.currentTimeMillis(); 25 | 26 | Map>> map = new HashMap<>(); 27 | 28 | consumerRecords.partitions().forEach(topicPartition -> { 29 | List> recordList = consumerRecords.records(topicPartition); 30 | List> newConsumerRecords = new ArrayList<>(); 31 | 32 | recordList.forEach(record -> { 33 | if (timeMillis - record.timestamp() < 10 * 1000) { 34 | newConsumerRecords.add(record); 35 | } 36 | }); 37 | if (!newConsumerRecords.isEmpty()) { 38 | map.put(topicPartition, newConsumerRecords); 39 | } 40 | }); 41 | return new ConsumerRecords<>(map); 42 | } 43 | 44 | @Override 45 | public void close() { 46 | 47 | } 48 | 49 | @Override 50 | public void onCommit(Map map) { 51 | map.forEach((tp, offset) -> System.out.println("tp:{"+tp+"}--offset:{"+offset.offset()+"}")); 52 | } 53 | 54 | @Override 55 | public void configure(Map map) { 56 | 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/java/com/tools/kafka/consumer/ConsumerReBalance.java: -------------------------------------------------------------------------------- 1 | package com.tools.kafka.consumer; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import org.apache.kafka.clients.consumer.*; 5 | import org.apache.kafka.common.TopicPartition; 6 | import org.apache.kafka.common.serialization.StringDeserializer; 7 | 8 | import java.time.Duration; 9 | import java.util.*; 10 | 11 | /** 12 | * ReBalance监听器的用法,如何做到减少重复消费。 13 | */ 14 | @Slf4j 15 | public class ConsumerReBalance { 16 | public static final String brokerList = "localhost:9092"; 17 | public static final String topic = "topic-1"; 18 | //新的group,相较于ConsumerQuickStart group-1分组,现在kafka是发布订阅模型 19 | public static final String groupId = "group-3"; 20 | public static final String out = "topic={} - partition={} - offset={} - value={}"; 21 | 22 | /** 23 | * 初始化配置 24 | * 25 | * @return 26 | */ 27 | private static Properties initProperties() { 28 | Properties properties = new Properties(); 29 | 30 | properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); 31 | properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName()); 32 | properties.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokerList); 33 | properties.put(ConsumerConfig.GROUP_ID_CONFIG, groupId); 34 | //添加自定义消费者拦截器,可以使用多个拦截器构成拦截链 35 | //当某个拦截器失败时,下一个会自动从上一个成功后的拦截器开始拦截 36 | properties.put(ConsumerConfig.INTERCEPTOR_CLASSES_CONFIG, ConsumerInterceptorTTL.class.getName()); 37 | 38 | //关闭kafka默认的自动提交offset,容易导致重复处理的问题 39 | properties.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false); 40 | return properties; 41 | } 42 | 43 | 44 | public static void main(String[] args) { 45 | 46 | try (KafkaConsumer consumer = new KafkaConsumer<>(initProperties())) { 47 | Map map = new HashMap<>(); 48 | consumer.subscribe(Collections.singletonList(topic), new ConsumerRebalanceListener() { 49 | @Override 50 | public void onPartitionsRevoked(Collection collection) { 51 | //同步提交 52 | consumer.commitSync(map); 53 | //亦可以选择存储到DB中。 54 | } 55 | 56 | @Override 57 | public void onPartitionsAssigned(Collection collection) { 58 | 59 | } 60 | }); 61 | 62 | try { 63 | while (true) { 64 | ConsumerRecords records = consumer.poll(Duration.ofMillis(1000)); 65 | records.forEach(record -> { 66 | log.info(out, 67 | 
record.topic(), 68 | record.partition(), 69 | record.offset(), 70 | record.value()); 71 | ///将offset存储到局部变量中,在ReBalance发生前,能够同步的提交offset避免重复消费 72 | map.put(new TopicPartition(record.topic(), record.partition()), 73 | new OffsetAndMetadata(record.offset() + 1)); 74 | } 75 | ); 76 | //异步提交offset 77 | consumer.commitAsync(map, null); 78 | } 79 | } finally { 80 | //使用同步提交,做最后的把关 81 | consumer.commitSync(); 82 | } 83 | 84 | } 85 | } 86 | 87 | } 88 | 89 | -------------------------------------------------------------------------------- /src/main/java/com/tools/kafka/consumer/ConsumerThread.java: -------------------------------------------------------------------------------- 1 | package com.tools.kafka.consumer; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import org.apache.kafka.clients.consumer.ConsumerRecord; 5 | import org.apache.kafka.clients.consumer.ConsumerRecords; 6 | import org.apache.kafka.clients.consumer.KafkaConsumer; 7 | import org.apache.kafka.clients.consumer.OffsetAndMetadata; 8 | import org.apache.kafka.common.TopicPartition; 9 | 10 | import java.time.Duration; 11 | import java.util.Collections; 12 | import java.util.List; 13 | import java.util.Map; 14 | import java.util.Properties; 15 | import java.util.concurrent.ArrayBlockingQueue; 16 | import java.util.concurrent.ExecutorService; 17 | import java.util.concurrent.ThreadPoolExecutor; 18 | import java.util.concurrent.TimeUnit; 19 | 20 | /** 21 | * 客户端消费 多线程方式实现 22 | */ 23 | @Slf4j 24 | public class ConsumerThread extends Thread { 25 | private KafkaConsumer kafkaConsumer; 26 | 27 | private ExecutorService executorService; 28 | 29 | private int threadNum; 30 | 31 | 32 | public ConsumerThread(Properties properties, String topic, int threadNum) { 33 | kafkaConsumer = new KafkaConsumer<>(properties); 34 | kafkaConsumer.subscribe(Collections.singletonList(topic)); 35 | this.threadNum = threadNum; 36 | 37 | executorService = new ThreadPoolExecutor( 38 | threadNum, 39 | threadNum, 40 | 0L, 41 | TimeUnit.MILLISECONDS, 42 | new ArrayBlockingQueue<>(1000), 43 | new ThreadPoolExecutor.CallerRunsPolicy()); 44 | } 45 | 46 | @Override 47 | public void run() { 48 | try { 49 | while (true) { 50 | ConsumerRecords records = kafkaConsumer.poll(Duration.ofMillis(100)); 51 | 52 | if (!records.isEmpty()) { 53 | executorService.submit(new RecordHandler(records)); 54 | } 55 | synchronized (RecordHandler.offsets) { 56 | if (!RecordHandler.offsets.isEmpty()) { 57 | kafkaConsumer.commitSync(RecordHandler.offsets, null); 58 | RecordHandler.offsets.clear(); 59 | } 60 | } 61 | } 62 | } catch (Exception e) { 63 | log.error(e.getMessage()); 64 | } finally { 65 | kafkaConsumer.close(); 66 | } 67 | 68 | } 69 | } 70 | 71 | class RecordHandler extends Thread { 72 | private ConsumerRecords records; 73 | 74 | public static Map offsets; 75 | 76 | public RecordHandler(ConsumerRecords records) { 77 | this.records = records; 78 | } 79 | 80 | @Override 81 | public void run() { 82 | records.partitions() 83 | .forEach(partition -> { 84 | List> record = records.records(partition); 85 | 86 | long lastConsumerOffset = record.get(record.size() - 1).offset(); 87 | 88 | synchronized (offsets) { 89 | if (!offsets.containsKey(partition)) { 90 | offsets.put(partition, new OffsetAndMetadata(lastConsumerOffset + 1)); 91 | } else { 92 | long position = offsets.get(partition).offset(); 93 | if (position < lastConsumerOffset + 1) { 94 | offsets.put(partition, new OffsetAndMetadata(lastConsumerOffset + 1)); 95 | } 96 | } 97 | } 98 | }); 99 | } 100 | } 
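ConsumerThread above is only the worker: nothing in this package wires it up, and the shared `RecordHandler.offsets` map is declared but never assigned, so the `synchronized (RecordHandler.offsets)` blocks would throw a NullPointerException as written. A small, hypothetical launcher sketch (broker address, group id, topic and thread count are placeholder assumptions, not values taken from this repo) that initializes the map and starts one consumer thread:

```java
package com.tools.kafka.consumer;

import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.StringDeserializer;

import java.util.HashMap;
import java.util.Properties;

/**
 * Hypothetical launcher for ConsumerThread; all property values below are placeholders.
 */
public class ConsumerThreadMain {

    public static void main(String[] args) {
        Properties props = new Properties();
        props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
        props.put(ConsumerConfig.GROUP_ID_CONFIG, "group-thread");
        props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
        props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
        // ConsumerThread commits offsets itself via commitSync(offsets), so auto-commit stays off
        props.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false);

        // RecordHandler.offsets is never initialized in the class above; assign it first,
        // otherwise the synchronized (RecordHandler.offsets) block would NPE.
        RecordHandler.offsets = new HashMap<>();

        // one poll thread feeding a handler pool sized to the machine's cores
        new ConsumerThread(props, "topic-1", Runtime.getRuntime().availableProcessors()).start();
    }
}
```

Since several handler threads touch the same map, keeping the synchronized blocks as in the original (or swapping in a ConcurrentHashMap) is what makes the shared offset bookkeeping safe.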
-------------------------------------------------------------------------------- /src/main/java/com/tools/kafka/producer/ProducerRandomInt.java: -------------------------------------------------------------------------------- 1 | package com.tools.kafka.producer; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import org.apache.kafka.clients.producer.KafkaProducer; 5 | import org.apache.kafka.clients.producer.ProducerConfig; 6 | import org.apache.kafka.clients.producer.ProducerRecord; 7 | import org.apache.kafka.common.serialization.StringSerializer; 8 | 9 | import java.util.Properties; 10 | import java.util.Random; 11 | 12 | /** 13 | * 向kafka中发送随机数 14 | */ 15 | @Slf4j 16 | public class ProducerRandomInt { 17 | public static final String brokerList = "spark01:9092"; 18 | public static final String topic = "randomCount_new"; 19 | 20 | /** 21 | * 初始化参数 22 | * 23 | * @return 24 | */ 25 | private static Properties initProperties() { 26 | Properties properties = new Properties(); 27 | 28 | //生产者需要序列化器将对象转换成字节数组才能通过网络发送给kafka服务端 29 | properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName()); 30 | //properties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer"); 31 | properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName()); 32 | 33 | //acks它代表消息确认机制 34 | properties.put("acks", "all"); 35 | //重试的次数 36 | properties.put("retries", 0); 37 | //批处理数据的大小,每次写入多少数据到topic 38 | properties.put("batch.size", 2); 39 | //可以延长多久发送数据 40 | properties.put("linger.ms", 1); 41 | properties.put("partitioner.class", "com.tools.kafka.CustomPartitioner"); 42 | //缓冲区的大小 43 | properties.put("buffer.memory", 33554432); 44 | properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokerList); 45 | return properties; 46 | } 47 | 48 | public static void main(String[] args) throws InterruptedException { 49 | Properties properties = initProperties(); 50 | 51 | KafkaProducer producer = new KafkaProducer<>(properties); 52 | Random random = new Random(10); 53 | for (int i = 0; i < 1000000; i++) { 54 | String value = "消息内容:"+i+"-----"+random.nextInt(10000); 55 | ProducerRecord record = new ProducerRecord<>(topic, value); 56 | producer.send(record); 57 | log.info("已发送:{}条, value为: {}", i, value); 58 | // Thread.sleep(1000); 59 | } 60 | producer.close(); 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/com/tools/kafka/quickstart/ConsumerQuickStart.java: -------------------------------------------------------------------------------- 1 | package com.tools.kafka.quickstart; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import org.apache.kafka.clients.consumer.ConsumerConfig; 5 | import org.apache.kafka.clients.consumer.ConsumerRecords; 6 | import org.apache.kafka.clients.consumer.KafkaConsumer; 7 | 8 | import java.time.Duration; 9 | import java.util.Collections; 10 | import java.util.Properties; 11 | 12 | @Slf4j 13 | public class ConsumerQuickStart { 14 | public static final String brokerList = "localhost:9092"; 15 | public static final String topic = "ProducerQuickStart"; 16 | public static final String groupId = "group-1"; 17 | public static final String out = "topic={} - partition={} - offset={} - value={}"; 18 | 19 | /** 20 | * 初始化配置 21 | * 22 | * @return 23 | */ 24 | private static Properties initProperties() { 25 | Properties properties = new Properties(); 26 | 27 | // 28 | properties.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, 
"org.apache.kafka.common.serialization.StringDeserializer"); 29 | properties.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"); 30 | properties.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokerList); 31 | properties.put(ConsumerConfig.GROUP_ID_CONFIG, groupId); 32 | return properties; 33 | } 34 | 35 | 36 | public static void main(String[] args) { 37 | 38 | KafkaConsumer consumer = new KafkaConsumer<>(initProperties()); 39 | 40 | consumer.subscribe(Collections.singletonList(topic)); 41 | 42 | while (true) { 43 | ConsumerRecords records = consumer.poll(Duration.ofMillis(1000)); 44 | records.forEach(record -> 45 | log.info(out, 46 | record.topic(), 47 | record.partition(), 48 | record.offset(), 49 | record.value())); 50 | } 51 | } 52 | 53 | } 54 | 55 | -------------------------------------------------------------------------------- /src/main/java/com/tools/kafka/quickstart/ProducerQuickStart.java: -------------------------------------------------------------------------------- 1 | package com.tools.kafka.quickstart; 2 | 3 | import org.apache.kafka.clients.producer.KafkaProducer; 4 | import org.apache.kafka.clients.producer.ProducerConfig; 5 | import org.apache.kafka.clients.producer.ProducerRecord; 6 | import org.apache.kafka.common.serialization.StringSerializer; 7 | 8 | import java.util.Properties; 9 | 10 | /** 11 | * kafka生产者。 12 | * 13 | * 主要三大组件: 14 | * - 序列化器 -> 必须配置 15 | * - 分区器 -> 选配 16 | * - 生产者拦截器 -> 选配 17 | * 18 | * 全部配置的情况下执行顺序为 生产者拦截器 -> 序列化器 -> 分区器 19 | */ 20 | public class ProducerQuickStart { 21 | public static final String brokerList = "localhost:9092"; 22 | public static final String topic = "ProducerQuickStart"; 23 | 24 | /** 25 | * 初始化参数 26 | * 27 | * @return 28 | */ 29 | private static Properties initProperties() { 30 | Properties properties = new Properties(); 31 | 32 | //生产者需要序列化器将对象转换成字节数组才能通过网络发送给kafka服务端 33 | properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName()); 34 | // properties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer"); 35 | properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName()); 36 | 37 | properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokerList); 38 | 39 | return properties; 40 | } 41 | 42 | public static void main(String[] args) throws InterruptedException { 43 | 44 | //构建producer实例。 45 | KafkaProducer producer = new KafkaProducer<>(initProperties()); 46 | 47 | for (int i = 100; i < 1000; i++) { 48 | 49 | //构建消息实例ProducerRecord 50 | ProducerRecord record = new ProducerRecord<>(topic, "hello kafka-" + i); 51 | 52 | //消息发送 53 | producer.send(record); 54 | 55 | Thread.sleep(5000); 56 | } 57 | 58 | 59 | producer.close(); 60 | } 61 | 62 | } 63 | 64 | -------------------------------------------------------------------------------- /src/main/java/com/tools/kafka/readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sev7e0/bigdata-practice/ffbdd93bd555fd388d4dd20ccc3379124a3eae5f/src/main/java/com/tools/kafka/readme.md -------------------------------------------------------------------------------- /src/main/java/com/tools/redis/CacheTuning.java: -------------------------------------------------------------------------------- 1 | package com.tools.redis; 2 | 3 | import com.google.common.hash.BloomFilter; 4 | import com.google.common.hash.Funnels; 5 | import redis.clients.jedis.Jedis; 6 | 7 | import 
java.time.Duration; 8 | import java.time.LocalTime; 9 | import java.time.temporal.ChronoField; 10 | import java.util.HashMap; 11 | import java.util.UUID; 12 | import java.util.concurrent.ExecutorService; 13 | import java.util.concurrent.Executors; 14 | import java.util.concurrent.atomic.AtomicReference; 15 | 16 | /** 17 | * 应对缓存穿透、缓存击穿、缓存雪崩的几种方案 18 | */ 19 | public class CacheTuning { 20 | 21 | private static final Jedis jedis = new Jedis(); 22 | private static final HashMap map = new HashMap<>(); 23 | private static final ExecutorService threadPool = Executors.newFixedThreadPool(10); 24 | 25 | public static void main(String[] args) { 26 | // System.out.println(jedis.ping()); 27 | map.put("1", "a"); 28 | map.put("2", "b"); 29 | map.put("3", "b"); 30 | 31 | // for (int i = 0; i < 1; i++) { 32 | // threadPool.execute(() -> map.keySet().forEach(key -> System.out.printf(get(key)))); 33 | // } 34 | // System.out.println(safeUpdateCache("3")); 35 | 36 | bloomFilter(); 37 | } 38 | 39 | /** 40 | * 缓存穿透 41 | * 42 | * 缓存穿透指的是查询一个不存在数据,由于缓存中一定不存在,那么每一次请求都会被打到db层,这样就会造成db挂掉的问题。 43 | */ 44 | /** 45 | * 1.使用bloom filter进行拦截 46 | *

47 | * 有很多种方法可以有效地解决缓存穿透问题,最常见的则是采用布隆过滤器,将所有可能存在的数据哈希到一个足够大的bitmap中,一个一定不存在的数据会被这个bitmap拦截掉, 48 | * 从而避免了对底层存储系统的查询压力。另外也有一个更为简单粗暴的方法(我们采用的就是这种),如果一个查询返回的数据为空(不管是数据不存在,还是系统故障), 49 | * 我们仍然把这个空结果进行缓存,但它的过期时间会很短,最长不超过五分钟。 50 |

51 | * 2. 无论db返回什么都进行缓存,但如果缓存的为空值,那么可以设置他的过期时间较短,比如五分钟 52 | */ 53 | private static void bloomFilter() { 54 | int size = 1000000; 55 | BloomFilter bloomFilter = BloomFilter.create(Funnels.stringFunnel(), size); 56 | 57 | for (int i = 0; i < size; i++) { 58 | bloomFilter.put(String.valueOf(i)); 59 | } 60 | LocalTime before = LocalTime.now(); 61 | if (bloomFilter.mightContain(String.valueOf(-1))) { 62 | System.out.println("mightContain"); 63 | } 64 | LocalTime now = LocalTime.now(); 65 | System.out.println(Duration.between(before, now).getNano()); 66 | 67 | } 68 | 69 | 70 | /** 71 | * 缓存击穿 72 | * 缓存在某个时间点过期的时候,恰好在这个时间点对这个Key有大量的并发请求过来, 73 | * 这些请求发现缓存过期一般都会从后端DB加载数据并回设到缓存,这个时候大并发的请求可能会瞬间把后端DB压垮。 74 | * 75 | * 与缓存穿透不同点在于,其是key实在db中存在的,不过某一时刻过期了导致不能够被获取到,请求就又转发到了db中。 76 | * 77 | * 与缓存雪崩不同的是,雪崩是大面积的key同时失效。 78 | * 79 | * 80 | * 1.使用分布式互斥锁的方式 解决缓存中找不到对应值的问题 81 | *

82 | * 简单说就是当检测到一个key失效时,对其使用分布式锁。用一个获取到锁的线程去加载数据,其他线程等待加载完成,锁被 83 | * 解除后才能够继续获取。 84 |

85 | * 2.不由redis控制过期时间,由程序维护,无论有没有查询到数据都直接返回。在获得的数据时若发现超时,则由程序发起异步线程进行缓存更新。 86 | * 优点是不会产生死锁,缺点数据一致性较低 87 | */ 88 | private static String updateCache(String key) { 89 | String stop = "stop"; 90 | String value = jedis.get(key); 91 | if (value == null) { 92 | if (jedis.setnx(stop, "1") == 1) { 93 | System.out.println("已获取到锁,正在更新缓存"); 94 | jedis.expire(stop, 3 * 60); 95 | value = dbGet(key); 96 | jedis.set(key, value); 97 | System.out.println("缓存更新完成!!!"); 98 | jedis.del(stop); 99 | } else { 100 | try { 101 | Thread.sleep(50); 102 | } catch (InterruptedException e) { 103 | e.printStackTrace(); 104 | } 105 | System.out.println("当前已被加锁,准备重试"); 106 | value = updateCache(key); 107 | } 108 | 109 | } 110 | return value; 111 | } 112 | 113 | /** 114 | * 使用更好的锁方式实现,不推荐上边的加锁方式,存在线程不安全的问题 115 | * 116 | * @param key 117 | * @return 118 | */ 119 | private static String safeUpdateCache(String key) { 120 | String stop = "stop"; 121 | String lockId = UUID.randomUUID().toString(); 122 | String value = jedis.get(key); 123 | if (value == null) { 124 | //redis分布式锁 125 | if (DistributedTool.acquireDistributedLock(jedis, stop, lockId, Long.valueOf(180))) { 126 | System.out.println("已获取到锁,正在更新缓存"); 127 | value = dbGet(key); 128 | jedis.set(key, value); 129 | System.out.println("缓存更新完成!!!"); 130 | DistributedTool.releaseDistributedLock(jedis, stop, lockId); 131 | } else { 132 | try { 133 | Thread.sleep(50); 134 | } catch (InterruptedException e) { 135 | e.printStackTrace(); 136 | } 137 | System.out.println("当前已被加锁,准备重试"); 138 | value = safeUpdateCache(key); 139 | } 140 | 141 | } 142 | return value; 143 | } 144 | 145 | /** 146 | * 2. 该种方式就是模拟当获取不到值时,使用一个新的线程进行数据库值进行更新,问题是 147 | * 这种方式问题是数据一致性较低,当第一次获取时永远时返回为空。 148 | */ 149 | private static String getByTimeOut(String key) { 150 | String stop = "stop"; 151 | AtomicReference value = new AtomicReference<>(jedis.get(key)); 152 | String[] split = value.get().split("."); 153 | String relValue = split[0]; 154 | Long timeout = Long.valueOf(split[1]); 155 | if (timeout < LocalTime.now().getLong(ChronoField.NANO_OF_DAY)) { 156 | threadPool.execute(() -> { 157 | if (jedis.setnx(stop, "1") == 1) { 158 | System.out.println("已获取到锁,正在更新缓存"); 159 | jedis.expire(stop, 3 * 60); 160 | value.set(dbGet(key)); 161 | jedis.set(key, value.get()); 162 | System.out.println("缓存更新完成!!!"); 163 | jedis.del(stop); 164 | } 165 | }); 166 | } 167 | return value.get(); 168 | } 169 | 170 | private static String dbGet(String key) { 171 | try { 172 | Thread.sleep(3000); 173 | } catch (InterruptedException e) { 174 | e.printStackTrace(); 175 | } 176 | return map.get(key); 177 | } 178 | 179 | /** 180 | * 缓存雪崩 181 | * 182 | * 缓存雪崩是指在我们设置缓存时采用了相同的过期时间,导致缓存在某一时刻同时失效,请求全部转发到DB,DB瞬时压力过重雪崩。 183 | * 184 | * 185 | * 1.使用加锁或者队列的单线程方式,保证在有大量的数据失效时,不会有大量的并发请求发送到DB,导致压力过大 186 | * 187 | * 188 | * 2.在原有的过期时间上 随机添加一些时间,由于过期时间不同,就能减轻在过期时产生的压力 189 | */ 190 | } 191 | -------------------------------------------------------------------------------- /src/main/java/com/tools/redis/DistributedTool.java: -------------------------------------------------------------------------------- 1 | package com.tools.redis; 2 | 3 | import org.slf4j.Logger; 4 | import org.slf4j.LoggerFactory; 5 | import redis.clients.jedis.Jedis; 6 | 7 | import java.util.Collections; 8 | 9 | /** 10 | * 基于redis实现的分布式锁,当前只是针对单节点模式。 11 | *

12 | * redis集群情况下可以考虑使用redisson。 13 | */ 14 | public class DistributedTool { 15 | 16 | private static final Logger logger = LoggerFactory.getLogger(DistributedTool.class); 17 | 18 | private static final String LOCK_STATUS = "OK"; 19 | /** 20 | * 当key不存在时进行当前操作,若存在则不操作 21 | */ 22 | private static final String SET_IF_NOT_EXIST = "NX"; 23 | /** 24 | * 设定key的超时时间 25 | */ 26 | private static final String SET_WITH_EXPIRE_TIME = "PX"; 27 | 28 | /** 29 | * @param jedis redis客户端 30 | * @param key 使用key来当作锁,保证唯一性 31 | * @param lockId 要保证每次加锁和解锁是一个来自客户端,只用一个key是无法保证的,这里使用value值作为一次完整加锁解锁请求id来保证。 32 | * @param time 超时时间,设定了超时时间后,即使持有锁的客户端发生崩溃,key也会因为过期而自动删除,从而释放锁。 33 | * @return 加锁状态 34 | */ 35 | public static Boolean acquireDistributedLock(Jedis jedis, String key, String lockId, Long time) { 36 | String status = jedis.set(key, lockId, SET_IF_NOT_EXIST, SET_WITH_EXPIRE_TIME, time); 37 | if (LOCK_STATUS.equals(status)) { 38 | logger.info("获取锁成功,当前:" + key + "-" + lockId); 39 | return true; 40 | } 41 | return false; 42 | } 43 | 44 | 45 | /** 46 | * @param jedis redis客户端 47 | * @param key 锁 48 | * @param lockId 锁对应的请求id 49 | * @return 释放锁结果 50 | *

51 | * 使用lua脚本是为了保证操作的原子性,redisson中使用同样的方式进行锁释放。 52 | *

53 | * 为什么锁redis使用lua是线程安全的? 54 | * 因为redis本身就是单线程的,而redis内置了lua的解析器,从而能保证线程安全(不够严谨) 55 | */ 56 | public static Boolean releaseDistributedLock(Jedis jedis, String key, String lockId) { 57 | //lua脚本 58 | String luaScript = "if redis.call('get', KEYS[1]) == ARGV[1] then return redis.call('del', KEYS[1]) else return 0 end"; 59 | //调用evel交给redis服务端执行脚本 60 | Object status = jedis.eval(luaScript, Collections.singletonList(key), Collections.singletonList(lockId)); 61 | if (LOCK_STATUS.equals(status)) { 62 | logger.info("释放锁成功,当前:" + key + "-" + lockId); 63 | return true; 64 | } 65 | return false; 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /src/main/java/com/tools/redis/lettuce/LettuceTools.java: -------------------------------------------------------------------------------- 1 | package com.tools.redis.lettuce; 2 | 3 | public enum LettuceTools { 4 | 5 | URL("redis://localhost:6379/0"); 6 | 7 | private String value; 8 | 9 | LettuceTools(String value){ 10 | this.value = value; 11 | } 12 | 13 | public String getValue() { 14 | return value; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/com/tools/redis/lettuce/MyListener.java: -------------------------------------------------------------------------------- 1 | package com.tools.redis.lettuce; 2 | 3 | import io.lettuce.core.pubsub.RedisPubSubListener; 4 | 5 | public class MyListener implements RedisPubSubListener { 6 | @Override 7 | public void message(Object channel, Object message) { 8 | 9 | } 10 | 11 | @Override 12 | public void message(Object pattern, Object channel, Object message) { 13 | 14 | } 15 | 16 | @Override 17 | public void subscribed(Object channel, long count) { 18 | 19 | } 20 | 21 | @Override 22 | public void psubscribed(Object pattern, long count) { 23 | 24 | } 25 | 26 | @Override 27 | public void unsubscribed(Object channel, long count) { 28 | 29 | } 30 | 31 | @Override 32 | public void punsubscribed(Object pattern, long count) { 33 | 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/java/com/tools/redis/lettuce/PubSubByLettuce.java: -------------------------------------------------------------------------------- 1 | package com.tools.redis.lettuce; 2 | 3 | import io.lettuce.core.RedisClient; 4 | import io.lettuce.core.RedisFuture; 5 | import io.lettuce.core.cluster.RedisClusterClient; 6 | import io.lettuce.core.cluster.pubsub.StatefulRedisClusterPubSubConnection; 7 | import io.lettuce.core.cluster.pubsub.api.async.RedisClusterPubSubAsyncCommands; 8 | import io.lettuce.core.cluster.pubsub.api.sync.RedisClusterPubSubCommands; 9 | import io.lettuce.core.pubsub.StatefulRedisPubSubConnection; 10 | import io.lettuce.core.pubsub.api.async.RedisPubSubAsyncCommands; 11 | import io.lettuce.core.pubsub.api.reactive.RedisPubSubReactiveCommands; 12 | import io.lettuce.core.pubsub.api.sync.RedisPubSubCommands; 13 | import org.slf4j.Logger; 14 | import org.slf4j.LoggerFactory; 15 | 16 | /** 17 | * 18 | */ 19 | public class PubSubByLettuce { 20 | 21 | private static Logger logger = LoggerFactory.getLogger(PubSubByLettuce.class); 22 | 23 | public static void main(String[] args) { 24 | RedisClient redisClient = RedisClient.create(LettuceTools.URL.getValue()); 25 | 26 | /** 27 | * 同步订阅 synchronous subscription 28 | */ 29 | StatefulRedisPubSubConnection pubSub = redisClient.connectPubSub(); 30 | 31 | pubSub.addListener(new MyListener()); 32 | 33 | 
RedisPubSubCommands sync = pubSub.sync(); 34 | 35 | sync.subscribe("channel"); 36 | 37 | 38 | /** 39 | * 异步订阅 asynchronous subscription 40 | */ 41 | StatefulRedisPubSubConnection pubSub1 = redisClient.connectPubSub(); 42 | 43 | pubSub1.addListener(new MyListener()); 44 | 45 | RedisPubSubAsyncCommands async = pubSub1.async(); 46 | 47 | //异步将会返回future 48 | RedisFuture future = async.subscribe("channel"); 49 | 50 | future.whenComplete((s,th)->{ 51 | if (th instanceof Exception){ 52 | logger.info(th.getMessage()); 53 | } 54 | }); 55 | 56 | 57 | /** 58 | * 使用reactive订阅 59 | */ 60 | 61 | StatefulRedisPubSubConnection pubSub2 = redisClient.connectPubSub(); 62 | 63 | RedisPubSubReactiveCommands reactive = pubSub2.reactive(); 64 | 65 | reactive.subscribe("channel").subscribe(); 66 | 67 | //将会接收到所有进来的消息,可以进行过滤操作,observe会在取消订阅时停止。 68 | reactive.observeChannels().doOnNext(message-> logger.info(message.getMessage())).subscribe(); 69 | 70 | 71 | /** 72 | * 在redis集群中使用订阅功能 73 | * 74 | * 在redis集群中可以是用订阅,但有几点需要注意: 75 | * 76 | * 用户在集群的一个节点上发布消息,集群会自动向所有节点广播,不论这台机器是否订阅了 77 | * 这个channel,这也就表示在集群中订阅消息时,不需要连接指定的消息发布的节点,任意 78 | * 任意一个节点都可以。 79 | */ 80 | 81 | 82 | RedisClusterClient clusterClient = RedisClusterClient.create(LettuceTools.URL.getValue()); 83 | 84 | StatefulRedisClusterPubSubConnection connection = clusterClient.connectPubSub(); 85 | 86 | //同步 87 | RedisClusterPubSubCommands sync1 = connection.sync(); 88 | 89 | sync1.subscribe("channel"); 90 | 91 | //异步 92 | RedisClusterPubSubAsyncCommands async1 = connection.async(); 93 | 94 | async1.subscribe("channel"); 95 | 96 | 97 | 98 | 99 | StatefulRedisClusterPubSubConnection connection0 = clusterClient.connectPubSub(); 100 | connection0.addListener(new MyListener()); 101 | connection0.setNodeMessagePropagation(true); 102 | RedisClusterPubSubCommands sync2 = connection0.sync(); 103 | sync2.masters().commands().subscribe("__keyspace@0__:*"); 104 | 105 | 106 | /** 107 | * 注意事项 108 | * 109 | * 复制到副本节点的键,特别是考虑到到期,会在保存该键的所有节点上生成键空间事件。如果一个密钥过期并被复制,它将在主副 110 | * 本和所有副本上过期。每个redis服务器都会发出keyspace事件。因此,订阅非主节点将使您的应用程序看到同一个密钥的同一类 111 | * 型的多个事件,因为redis是分布式的。 112 | * 113 | * 订阅可以通过使用nodeselection api或对单个集群节点连接调用subscribe(…)来发出。订阅注册不会传播到拓扑更改时添 114 | * 加的新节点。 115 | */ 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/main/java/com/tools/redis/lettuce/QuickStartByLettuce.java: -------------------------------------------------------------------------------- 1 | package com.tools.redis.lettuce; 2 | 3 | import io.lettuce.core.LettuceFutures; 4 | import io.lettuce.core.RedisClient; 5 | import io.lettuce.core.RedisFuture; 6 | import io.lettuce.core.api.StatefulRedisConnection; 7 | import io.lettuce.core.api.async.RedisAsyncCommands; 8 | import io.lettuce.core.api.sync.RedisCommands; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | 12 | import java.util.concurrent.ExecutionException; 13 | import java.util.concurrent.TimeUnit; 14 | import java.util.concurrent.TimeoutException; 15 | import java.util.stream.IntStream; 16 | 17 | public class QuickStartByLettuce { 18 | private static final Logger logger = LoggerFactory.getLogger(QuickStartByLettuce.class); 19 | 20 | private final static String URI = "redis://localhost:6379/0"; 21 | 22 | public static void main(String[] args) throws InterruptedException { 23 | 24 | /** 25 | * 同步方式 26 | */ 27 | 28 | logger.info("使用同步API执行命令"); 29 | //创建RedisClient实例 30 | RedisClient redisClient = RedisClient.create(URI); 31 | 32 | //创建redis连接 33 | StatefulRedisConnection 
connect = redisClient.connect(); 34 | 35 | //获取用于同步执行的命令API。lettuce也支持异步(async())和反应式执行模型(reactive())。 36 | //返回RedisCommands 37 | RedisCommands redisCommands = connect.sync(); 38 | 39 | //可直接使用redis的command 40 | redisCommands.set("test", "value"); 41 | 42 | logger.info(redisCommands.get("test")); 43 | 44 | 45 | /** 46 | * 需要手动关闭连接,连接默认设计为长连接且线程安全 47 | * 当前链接失效时会自动重连,一直到close()被调用 48 | */ 49 | connect.close(); 50 | 51 | 52 | /** 53 | * 异步方式 54 | */ 55 | 56 | logger.info("使用异步API执行命令"); 57 | StatefulRedisConnection aconnect = redisClient.connect(); 58 | 59 | RedisAsyncCommands asyncCommands = aconnect.async(); 60 | 61 | asyncCommands.set("async", "command"); 62 | 63 | //在lettuce中使用异步API执行command将会返回RedisFuture, 64 | //他是继承自CompletionStage,可以取消(cancel()),也可以查询执行状态(isDone(),isCancelled()) 65 | RedisFuture async = asyncCommands.get("async"); 66 | 67 | try { 68 | 69 | //可以从RedisFuture获取到返回的结果。 70 | logger.info(async.get()); 71 | 72 | //将会等待10秒,再去获取RedisFuture返回的值 73 | //超时将会抛出TimeoutException 74 | logger.info(async.get(5, TimeUnit.SECONDS)); 75 | 76 | //有结果返回时将会调用。 77 | async.thenAccept(s -> logger.info(s)); 78 | 79 | //有结果返回后,使用异步线程执行 80 | async.thenAcceptAsync(s -> logger.info("返回后使用异步线程执行")); 81 | 82 | } catch (InterruptedException e) { 83 | logger.error(e.getMessage()); 84 | } catch (ExecutionException e) { 85 | logger.error(e.getMessage()); 86 | } catch (TimeoutException e) { 87 | logger.error(e.getMessage()); 88 | } 89 | 90 | try { 91 | Thread.sleep(5000); 92 | } catch (InterruptedException e) { 93 | e.printStackTrace(); 94 | } 95 | 96 | 97 | /** 98 | * 同步使用future,暂未完成 99 | */ 100 | 101 | logger.info("代码不会等到某个命令完成后再发出另一个命令。同步是在发出所有命令之后完成的。"); 102 | LettuceFutures.awaitAll(1, TimeUnit.MINUTES, IntStream.range(0, 10).mapToObj(i -> asyncCommands.set("key-" + i, "value-" + i)).toArray(RedisFuture[]::new)); 103 | 104 | 105 | logger.info("对单个futur也可以使用await"); 106 | RedisFuture future = asyncCommands.get("key-0"); 107 | if (!future.await(1, TimeUnit.MINUTES)) { 108 | System.out.println("在超时时间内未完成!"); 109 | } 110 | 111 | 112 | logger.info("还有一种使用阻塞future的是采用循环的方式"); 113 | RedisFuture future1 = asyncCommands.get("key-1"); 114 | while (!future1.isDone()) { 115 | logger.info("当前查询任务还未完成,继续阻塞"); 116 | } 117 | 118 | 119 | /** 120 | * 错误处理 121 | * 122 | * 1.返回默认值 123 | * 2.使用备用的future 124 | * 3.重试future 125 | */ 126 | 127 | //可以使用handle函数在出现异常时返回默认值 128 | future1.handle((s, throwable) -> { 129 | if (throwable != null) { 130 | return "default value"; 131 | } 132 | return s; 133 | }).thenAccept(s -> logger.info("获取到的value为:{}", s)); 134 | 135 | 136 | //future支持可以根据不同的返回异常的类型,使用不同的默认值 137 | future1.exceptionally(throwable -> { 138 | if (throwable instanceof IllegalStateException) { 139 | return "IllegalStateException"; 140 | } else if (throwable instanceof ExecutionException) { 141 | return "ExecutionException"; 142 | } 143 | return "default value"; 144 | }).thenAccept(s -> logger.info("当前返回值为:{}", s)); 145 | 146 | 147 | // 148 | future1.whenComplete((s, throwable) -> { 149 | if (throwable instanceof IllegalStateException) { 150 | logger.error("异常为:{}", throwable.getMessage()); 151 | } 152 | }).thenAccept(s -> logger.info("当前value:{}", s)); 153 | 154 | 155 | //关闭实例,释放线程和资源。 156 | redisClient.shutdown(); 157 | } 158 | 159 | 160 | } 161 | -------------------------------------------------------------------------------- /src/main/java/com/tools/redis/lettuce/TransactionsByLettuce.java: -------------------------------------------------------------------------------- 1 | package com.tools.redis.lettuce; 2 | 3 
| import io.lettuce.core.KeyValue; 4 | import io.lettuce.core.RedisClient; 5 | import io.lettuce.core.RedisFuture; 6 | import io.lettuce.core.TransactionResult; 7 | import io.lettuce.core.api.async.RedisAsyncCommands; 8 | import io.lettuce.core.api.reactive.RedisReactiveCommands; 9 | import io.lettuce.core.api.sync.RedisCommands; 10 | import lombok.extern.slf4j.Slf4j; 11 | 12 | import java.util.List; 13 | import java.util.concurrent.ExecutionException; 14 | 15 | @Slf4j 16 | public class TransactionsByLettuce { 17 | 18 | public static void main(String[] args) throws ExecutionException, InterruptedException { 19 | RedisClient client = RedisClient.create(LettuceTools.URL.getValue()); 20 | 21 | 22 | /** 23 | * 24 | * Transactions using the asynchronous API 25 | * 26 | * 与非事务方式接近,同样是返回RedisFuture,可以对这个返回的future使用与 27 | * 非事务方式同样的操作 28 | */ 29 | RedisAsyncCommands async = client.connect().async(); 30 | 31 | async.multi(); 32 | 33 | async.set("key3", "value3"); 34 | 35 | RedisFuture set = async.get("key5"); 36 | 37 | RedisFuture future = async.exec(); 38 | 39 | TransactionResult objects = future.get(); 40 | log.info("第一次返回为{}, 第二次返回为{}", set.get(), objects.get(1)); 41 | if (objects.get(0) == set.get()) { 42 | log.info("结果相同"); 43 | } 44 | 45 | 46 | /** 47 | * Transactions using the reactive API 48 | * 49 | * 使用react api可以在一步执行多个命令 50 | * 51 | * 以下代码启动事务,在事务中执行两个命令,最后执行事务 52 | */ 53 | 54 | RedisReactiveCommands reactive = client.connect().reactive(); 55 | 56 | reactive.multi().subscribe(multiResponse -> { 57 | reactive.set("key", "1").subscribe(); 58 | reactive.incr("key").subscribe(); 59 | reactive.exec().subscribe(); 60 | }); 61 | 62 | /** 63 | * Transactions on clustered connections 64 | * 65 | * 默认情况下,集群会自动路由,意味着你不能确定你的命令是在 66 | * 那一台节点上执行的,所以当执行在集群环境时,使用普通的事务 67 | * 命令即可。 68 | */ 69 | 70 | RedisCommands redis = client.connect().sync(); 71 | redis.multi(); 72 | redis.set("one", "1"); 73 | redis.set("two", "2"); 74 | redis.mget("one", "two"); 75 | redis.llen("key"); 76 | 77 | redis.exec(); // result: list("OK", "OK", list("1", "2"), 0L) 78 | 79 | 80 | /** 81 | * Mult executing multiple asynchronous commands 82 | */ 83 | RedisAsyncCommands async1 = client.connect().async(); 84 | async1.multi(); 85 | RedisFuture set1 = async1.set("one", "1"); 86 | RedisFuture set2 = async1.set("two", "2"); 87 | RedisFuture>> mget = async1.mget("one", "two"); 88 | RedisFuture llen = async1.llen("key"); 89 | 90 | set1.thenAccept(value -> log.info(value)); // OK 91 | 92 | RedisFuture exec = async1.exec(); // result: list("OK", "OK", list("1", "2"), 0L) 93 | exec.thenAccept(value -> log.info(value.get(0))); 94 | 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/main/java/com/tools/redis/redisson/RedissonDelayQueue.java: -------------------------------------------------------------------------------- 1 | package com.tools.redis.redisson; 2 | 3 | import org.redisson.Redisson; 4 | import org.redisson.api.RBlockingQueue; 5 | import org.redisson.api.RDelayedQueue; 6 | import org.redisson.api.RedissonClient; 7 | import org.redisson.config.Config; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | 11 | import java.util.concurrent.TimeUnit; 12 | 13 | /** 14 | * Title: RedissonDelayQueue.java 15 | * description: TODO 16 | * 17 | * @author sev7e0 18 | * @version 1.0 19 | * @since 2021-01-25 11:56 20 | **/ 21 | 22 | public class RedissonDelayQueue { 23 | 24 | static final Logger logger = LoggerFactory.getLogger(RedissonDelayQueue.class); 25 | 26 | public 
static void main(String[] args) throws InterruptedException { 27 | final Config config = new Config(); 28 | config.useSingleServer().setAddress("redis://localhost:6379/0"); 29 | final RedissonClient redissonClient = Redisson.create(config); 30 | 31 | 32 | for (int i = 0; i < 100; i++) { 33 | final RBlockingQueue blockingQueue = redissonClient.getBlockingQueue("delay_queue"); 34 | 35 | final RDelayedQueue delayedQueue = redissonClient.getDelayedQueue(blockingQueue); 36 | final String s = "obj test666:" + i; 37 | delayedQueue.offer(s, 100-i, TimeUnit.SECONDS); 38 | logger.info("消息入队,内容:{},时间 :{}", s, i); 39 | delayedQueue.destroy(); 40 | } 41 | 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/com/tools/redis/redisson/RedissonDelayQueueConsumer.java: -------------------------------------------------------------------------------- 1 | package com.tools.redis.redisson; 2 | 3 | import org.redisson.Redisson; 4 | import org.redisson.api.RBlockingQueue; 5 | import org.redisson.api.RDelayedQueue; 6 | import org.redisson.api.RedissonClient; 7 | import org.redisson.config.Config; 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | 11 | import java.util.Objects; 12 | import java.util.concurrent.TimeUnit; 13 | 14 | /** 15 | * Title: RedissonDelayQueue.java 16 | * description: TODO 17 | * 18 | * @author sev7e0 19 | * @version 1.0 20 | * @since 2021-01-25 11:56 21 | **/ 22 | 23 | public class RedissonDelayQueueConsumer { 24 | 25 | static final Logger logger = LoggerFactory.getLogger(RedissonDelayQueueConsumer.class); 26 | 27 | public static void main(String[] args) throws InterruptedException { 28 | final Config config = new Config(); 29 | config.useSingleServer().setAddress("redis://localhost:6379/0"); 30 | final RedissonClient redissonClient = Redisson.create(config); 31 | 32 | final RBlockingQueue blockingQueue = redissonClient.getBlockingQueue("delay_queue"); 33 | 34 | while (true) { 35 | final String poll = blockingQueue.poll(2, TimeUnit.SECONDS); 36 | if (Objects.isNull(poll)) { 37 | continue; 38 | } 39 | logger.info("消息出队队,内容:{}", poll); 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/com/tools/redis/redis持久化详解.md: -------------------------------------------------------------------------------- 1 | # redis持久化详解 2 | 3 | 在redis中提供了两种持久化方式RDB(快照)和AOF,不过两种各有利弊,接下来详细说一下。 4 | 5 | ## RDB持久化方式 6 | 7 | ![20190820213301571.png](https://files.sev7e0.site/images/oneblog/20190820213301571.png) 8 | 9 | RBD持久化是redis的默认策略。 10 | 11 | redis会生成一个二进制文件,默认情况下dump.rdb会叫这个名字,但这个文件是可以被redis还原成文件生成时redis的状态。 12 | 13 | redis支持两个命令生成RDB文件,`SAVE`和`BGSAVE`,主要区别在一前者为阻塞方式生成文件。 14 | 后者则是以子进程(fork)的方式进行生成文件,也就以为这不会产生阻塞,父进程可以继续响应请求,这里主要介绍`BGSAVE`该种方式,应为第一种方式将会阻塞,在大量数据的情况下,服务将尝试建不可用。 15 | 16 | #### fork 17 | 18 | Linux 提供的一种进程机制,当前进程调用 fork 将会产生一个子进程,该子进程会与当前进程共享一块内存,也就是数据段和代码段都是相同的。 19 | 20 | 子进程在在得到了内存后,开始疯狂的写入文件做持久化,此时若当前进程接受到新的请求,进行数据更改时,将会把共享的内存段复制一份,当前进程会基于新的内存数据进行操作。 21 | 这样一来,我们就不用再去担心当前进程对即将产生的快照产生影响了。 22 | 23 | 24 | ** 若在BGSAVE执行期间,手动再次调用`SAVE`、`BGSAVE`或`BGREWRITEAOF`命令会产生两次备份么?** 25 | 26 | redis规定在备份命令执行期间,再次执行以上命令则不会被执行,为了防止产生竞争同时执行两个`rdbSave`,`SAVE`、`BGSAVE`两个命令不会执行。 27 | 而`BGREWRITEAOF`命令会在,前一个命令执行完成后开始执行。 28 | 29 | 但如果是`BGREWRITEAOF`命令正在执行,此时客户端发送`BGSAVE`命令则会被拒绝。 30 | 31 | 具体原因是两个命令都是由子进程执行的,所以在操作方面没有冲突的地方,不能同时执行的原因是性能上的考虑——并发出两个子进程,并且这两个子进程都会同时执行大量 io(磁盘写入)操作。 32 | 33 | ## AOF持久化方式 34 | 35 | 
![20190820213255264.jpg](https://files.sev7e0.site/images/oneblog/20190820213255264.jpg) 36 | 37 | AOF 持久化在 Redis 中默认为关闭,他类似于 Mysql 的 Binlog,是一个用来记录 Redis 所有操作命令的日志,若在 redis 启动时就开启了AOF,那么他将记录所有的操作命令,在进行恢复时只需要进行 AOF 的命令重放即可。 38 | 39 | 正因为他记录了所有的操作命令,所以他也存在一些问题: 40 | - 随着 AOF 记录的操作命令越来越多导致生成的文件很大。 41 | - 由于生成的文件大,导致在进行命令重放的时候时间较长。 42 | 43 | redis 中 AOF 的写操作是在逻辑处理之后,导致 redis 无法进行数据的回滚,这也是与 MySQL 的 binlog不同的一点。 44 | 45 | bgrewriteaof:针对上述的问题,Redis 在 2.4 之后也使用了 bgrewriteaof 对 AOF 日志进行瘦身。 46 | 47 | bgrewriteaof 命令用于异步执行一个 AOF 文件重写操作。重写会创建一个当前 AOF 文件的体积优化版本。 48 | 49 | ## RDB && AOF 混合持久化方式 50 | 51 | 由于两种方式都存在一些问题,使用 RDB 会导致在写入时丢失新写入的数据,而使用 AOF 会存在持久化文件过大导致恢复时停机时间较长的问题。 52 | 53 | 所以在Redis4.0之后引入了新的持久化机制,将 RDB 与 AOF 进行结合,使用 RDB 策略进行持久化,同时将这段时间内的操作使用 AOF 进行记录,这样既能够快速的生成文件,同时 AOF 不在需要进行全量的操作记录,只需要保存前一次 RDB 开始后的增量 AOF 即可,这样生成的 AOF 持久化文件将不会再过大。 -------------------------------------------------------------------------------- /src/main/java/com/tools/redis/redis的五大数据类型.md: -------------------------------------------------------------------------------- 1 | ## redis五种数据类型 2 | 3 | ### 字符串(string) 4 | 5 | **字符串有哪些常用的命令?** 6 | 7 | | op | 注释 | 8 | |---|---| 9 | | APPEND | 将value值追加到给定的key当前对应的value的末尾 | 10 | | GETRANGE | 获取一个给定范围内的字符串 | 11 | | SETRANGE | 将指定的位置开始设定为给定值 | 12 | | GETBIT | 将字符串看成二进制串,返回偏移量在子串中所对应的值 | 13 | | SETBIT | 将字符串看成二进制串,设定偏移量对应子串中的位置为给定值 | 14 | | BITCOUNT | 统计二进制子串中位置为1的数量,可选择指定的区间 | 15 | | BITSTOP | 可以对二进制子串进行逻辑运算,并将结果保存到新的key-value中 | 16 | 17 | **实现原理** 18 | 19 | 底层由redis实现的简单动态字符串(SDS)实现的 20 | 21 | 相较于c语言的字符串的有几个优点: 22 | - 获取字符串的长度的时间复杂度为O(1) 23 | - API安全不会造成缓冲区溢出 24 | - 修改字符串时最多需要n次内存分配 25 | - 可以同时保存文本和二进制数 26 | - 可以使用c中原来的库函数 27 | 28 | 支持存储三种类型的值: 29 | - 字符串 30 | - 整数 31 | - 浮点数 32 | 33 | **redis在字符串方面有哪些不同于其他数据库?** 34 | 35 | 很多键值数据库只能将数据存储为普通的字符串,并且不提供字符串处理操作,有一些数据库虽 36 | 然支持简单的追加,但却不可以像redis一样对字符串的子串进行读写(GETRANGE)。 37 | 38 | **redis的字符串是如何保存整数和浮点数的?** 39 | 40 | redis字符串底层使用的也是字符串数组,所以在保存时可以使用整数和浮点数,并且他会自动 41 | 识别出你保存的是整数还是浮点数,还是字符串,如果是浮点数整数,他将会支持使用自增或 42 | 自减等操作。 43 | 44 | ### 列表(list) 45 | 46 | **列表的常用命令** 47 | 48 | | op | 注释 | 49 | |---|---| 50 | | RPUSH | 在存储在键的列表尾部插入所有指定的值。如果键不存在,则在执行推送操作之前将其创建为空列表。 | 51 | | LPUSH | 在存储在键的列表头部插入所有指定的值。如果键不存在,则在执行推送操作之前将其创建为空列表。 | 52 | | LPOP | 从列表的头部开始推出,非阻塞的 | 53 | | RPOP | 从列表的头部开始推出,非阻塞的 | 54 | | RPOPLPUSH | 原子地返回并删除存储在源位置的列表的最后一个元素(尾),并将该元素推送到存储在目标位置的列表的第一个元素(头) | 55 | | LINSERT | 尾部插入 | 56 | | LINDEX | 获取到对应索引的value值,和普通链表一样,索引从0开始 | 57 | | BLPOP、BRPOP |一个阻塞列表pop。lpop和rpop的阻塞版本,因为当没有元素从任何给定的列表中弹出时,它会阻塞连接。按照给定的顺序检查给定的键。| 58 | | BRPOPLPUSH | 阻塞的RPOPLPUSH实现,当列表中不为空时,其功能与RPOPLPUSH完全一样,但当为空时则会阻塞,直到其他的客户端将数据放入进来(可使用无限期阻塞),或者超时才会返回。 | 59 | | LLEN | 获取当前列表的长度 | 60 | 61 | **~~在redis3.2之前的版本list实现~~** 62 | 63 | 列底层使用了压缩列表和双向链表来实现的,在列表中对象较少时,会使用压缩列表,随着包含 64 | 的对象越来越多时, 65 | 将会逐渐转换为性能等方面更好的更适合处理大量元素的双端链表(关于压缩列表和双端链表可 66 | 查阅《redis的设计与实现》)。 67 | ``` 68 | redis:6379> RPUSH zip a b c de 12 23 45 "dd" 69 | (integer) 8 70 | redis:6379> OBJECT encoding zip 71 | "ziplist" 72 | ``` 73 | 但从redis3.2开始将不会在看当这样的返回。 74 | 75 | **在redis3.2之后的版本list实现** 76 | 77 | 列表底层使用了一种数据结构实现[quick list][1],`quicklist`是一个双向链表,不过这 78 | 个双向链表的节点使用的则是`ziplist`,如果了解过`ziplist`那将会知道,它是一个内存 79 | 紧凑的数据结构,其中的每一个数据项前后相邻,并且能够维持数据项的先后顺序。 80 | 81 | **为什么要使用quicklist这种数据结构** 82 | 83 | - 双向链表便于在表的两端进行push和pop操作,但是它的内存开销比较大。首先,它在每个节点 84 | 上除了要保存数据之外,还要额外保存两个指针;其次,双向链表的各个节点是单独的内存块, 85 | 地址不连续,节点多了容易产生内存碎片。 86 | - ziplist由于是一整块连续内存,所以存储效率很高。但是,它不利于修改操作,每次数据变动 87 | 都会引发一次内存的realloc。特别是当ziplist长度很长的时候,一次realloc可能会导致大 88 | 批量的数据拷贝,进一步降低性能。 89 | 90 | **一个quicklist节点包含多长的ziplist才能在空间和时间上达到最优?** 91 | 92 
| - 每个quicklist节点上的ziplist越短,则内存碎片越多。内存碎片多了,有可能在内存中产 93 | 生很多无法被利用的小碎片,从而降低存储效率。这种情况的极端是每个quicklist节点上的 94 | ziplist只包含一个数据项,这就蜕化成一个普通的双向链表了。 95 | - 每个quicklist节点上的ziplist越长,则为ziplist分配大块连续内存空间的难度就越大。 96 | 有可能出现内存里有很多小块的空闲空间(它们加起来很多),但却找不到一块足够大的空闲空间 97 | 分配给ziplist的情况。这同样会降低存储效率。这种情况的极端是整个quicklist只有一个节点, 98 | 所有的数据项都分配在这仅有的一个节点的ziplist里面。这其实蜕化成一个ziplist了。 99 | 100 | 由此可见,每个quicklist节点的ziplist要保持多长,这可能要等到具体的使用场景才能够决定。 101 | `list-max-ziplist-size`可以进行ziplist的size配置。当列表很长时,可以使用 102 | `list-compress-depth`进行中间段压缩。 103 | 104 | ### 散列(hash) 105 | 106 | **常用命令** 107 | 108 | `OP hash field value` 109 | 110 | | op | 注释 | 111 | |---|---| 112 | | HSET | 设置哈希表的值不存在则创建,存在覆盖,不存在时写入成功返回1,存在时覆盖成功时返回0 113 | | HSETNX | 当哈希表的filed不存在时创建,field存在时放弃操作,当hash表不存在时则创建hash表,再次执行命令HSETNX,成功创建filed返回1,放弃返回0 114 | | HGET | 根据给定的哈希表和给定的filed查询出value值 115 | | HEXISTS | 检查给定的filed是否存在于哈希表中 116 | | HDEL | 删除指定filed 117 | | HLEN | 哈希表的长度,就是filed的数量 118 | | HSTRLEN | 返回哈希表 key 中, 与给定域 field 相关联的值的字符串长度(string length)。 119 | | HINCRBY | filed值自增 120 | | HMSET | 同HSET,支持多个value 121 | | HMGET | 同HGET,支持多个value 122 | | HKEYS | 获取指定filed的所有key 123 | | HVALS | 获取指定filed的所有value 124 | | HGETALL | 同时获取filed下所有的key-value 125 | 126 | **实现方式** 127 | 128 | 散列的底层提供了两种实现方式,`ziplist`和`hashtable`,在一定条件下两种方式会发生相 129 | 互转换,当散列表较小时,默认使用的时`ziplist`,当一个filed存储过多的key-value时会 130 | 转而使用`hashtable`。 131 | 132 | **ziplist如何实现hash** 133 | 134 | ziplist使用entry保存每一对键值对,当有新的加入进来时,key会先放到压缩列表的的尾部,然后再 135 | 将value放到尾部,保证每一个key和value时紧挨着的,这样先放入的键值对会存在压缩列表的头部, 136 | 后方进来的会保持在尾部。 137 | 138 | **hashtable如何实现的hash** 139 | 140 | hashtable实现hash使用的时字典进行保存键值对,字典的键保存键值对的键,值保存键值对的 141 | 值。字典中的键和值都是用字符串对象。 142 | 143 | **什么情况下会使他们发生转换** 144 | 145 | - hash对象中保存的键值对的键和值的字符串长度都小于64 146 | - hash对象中保存的键值对少于512个 147 | 148 | 满足以上两点键会使用ziplist,反之将会转化为hashtable,不过量值不是固定的,可以通过 149 | 配置文件进行修改,`hash-max-ziplist-value` and `hash-max-ziplist-entries`。 150 | 151 | ### 集合(Set) 152 | 153 | **常用命令** 154 | 155 | `OP key member [member ...]` 156 | 157 | | op | 注释 | 158 | |---|---| 159 | |SADD|向集合中添加一个元素,member已经存在则会被忽略,key不存在将会被创建 160 | |SISMEMBER|判断member是否为集合中的成员,是返回1其他情况返回0 161 | |SPOP|随机移除一个元素 162 | |SRANDMEMBER|只提供 key 参数时,返回随机一个元素;如果集合为空,返回nil,如果提供了count参数,那么返回一个数组;如果集合为空,返回空数组。 163 | |SREM|移除一个或者多个元素 164 | |SMOVE|原子性操作,移动原集合中的member到目标集合中,目标中存在这是单纯的将member移除。 165 | |SCARD|返回集合中的数量 166 | |SMEMBERS|返回结合中的所有成员 167 | |SINTER|返回一个集合的全部成员,该集合是所有给定集合的交集。不存在的 key 被视为空集。 168 | |SUNION|返回一个集合的全部成员,该集合是所有给定集合的并集。 169 | |SDIFF|返回一个集合的全部成员,该集合是所有给定集合之间的差集。 170 | |SDIFFSTORE|与SDIFF相识,但它将结果保存到 destination 集合,而不是简单地返回结果集。 171 | 172 | **实现方式** 173 | 174 | 集合的底层实现是由intset和hashtable实现,使用整数集合时所有元素都被放在集合里面, 175 | 使用hashtable时,将会被保存在字典中,字典的key将会保存每一个元素,字典的value值将会被 176 | 置null。 177 | 178 | **何时发生转换** 179 | 180 | - 当集合中所有元素都是整数时 181 | - 当集合中保存的数量超过512时 182 | 183 | 同时满足以上两点那么redis将会使用intset保存集合中的元素。同样这个是可配置的使用用 184 | `set-max-intset-entries`进行配置。 185 | 186 | ### 有序集合(sort set) 187 | 188 | **常用命令** 189 | 190 | | op | 注释 | 191 | |---|---| 192 | |ZADD | 将member及其score放入到有序key的集合中,如果某个 member 已经是有序集的成员,那么更新这个 member 的 score 值,并通过重新插入这个 member 元素,来保证该 member 在正确的位置上。 193 | |ZSCORE | 返回有序集 key 中,成员 member 的 score 值。 194 | |ZCARD | 返回有序集 key 的基数。 195 | |ZCOUNT | 返回有序集 key 中, score 值在 min 和 max 之间(默认包括 score 值等于 min 或 max )的成员的数量。 196 | |ZRANGE | 返回有序集 key 中,指定区间内的成员。 197 | |ZREVRANGE | 返回有序集 key 中,指定区间内的成员,逆序排列。 198 | |ZRANGEBYSCORE | 返回有序集 key 中,指定区间内的成员。 199 | |ZREVRANGEBYSCORE | 返回有序集 key 中,所有 score 值介于 min 和 max 之间(包括等于 min 或 max )的成员。有序集成员按 score 值逆序。 200 | |ZRANK | 返回有序集 key 中成员 member 的排名。其中有序集成员按 
score 值递增(从小到大)顺序排列。 201 | |ZREVRANK | 返回有序集 key 中成员 member 的排名。其中有序集成员按 score 值逆序。 202 | |ZREM | 移除有序集 key 中的一个或多个成员,不存在的成员将被忽略。key存在但不是有序集合时将会报错 203 | |ZREMRANGEBYRANK | 移除有序集 key 中范围内的成员,不存在的成员将被忽略。按照rank的排序 204 | |ZREMRANGEBYSCORE | 移除有序集 key 中的范围内的成员,不存在的成员将被忽略。按照score排序 205 | |ZUNIONSTORE | 交集,并将结果存储到新的有序集合中 206 | 207 | **实现方式** 208 | 209 | 有序集合内部由压缩列表、字典和[跳表][2]实现的。 210 | - 在ziplist实现中,每个集合元素使用两个紧挨在一起的压缩列表实现,第一个节点保存元素的 211 | 成员,第二个保存元素的score,内部按照score的大小进行排列,score大的放在靠近表尾,小 212 | 的放在表头。 213 | - 在skiplist的实现中,使用zset作为地层结构,每个zset包含了一个字典和一个跳表。在跳表中 214 | 节点的object属性保存了元素的成员,而跳表的score属性保存了有序集合元素的score。在字典中 215 | 每个字典的key将会保存元素,value将会用来保存score,这样就创建了一个元素到score的映射 216 | ,加快`zscore`的速度。虽然在redis的有序集合skiplist实现中同时使用了两种数据结构, 217 | 不过两种结构时对象项共享的,也就是锁元素的String对象和score的float对象都是被共享的, 218 | 所以不会产生内存浪费这种现象。 219 | 220 | **为什么要同时使用两种数据结构实现有序集合?** 221 | 222 | 源码中的注释大概意思就是,为了效率。并且明确的指出了两种数据结构使用的是共享SDS,也就 223 | 是说redis在管理一个字符串时另一个也会被影响。至于为什么使用两种,可以这样理解,若单独使用 224 | 字典来实现,那以O(1)的时间获取指定元素的score将会被保持,但是`ZRANGE`使用这样的范围型 225 | 操作时,由于字典无序,那么也就是说每次获取前都要进行排序,至少需要O(log(n))。同样若单 226 | 独使用跳表实现,那么每次查找元素对应的score将会花费O(log(n))。所以为了让有序集合的查找 227 | 和范围操作快速执行,redis使用了两种数据结构。 228 | 229 | **何时发生转换** 230 | 231 | - 当有序集合中元素数量小于128 232 | - 当有序集合中每个元素的长度小于64 233 | 234 | 同时满足以上两点那么redis将会使用ziplist保存集合中的元素。反之使用skiplist同样这个是可配置的使用用 235 | `zset-max-ziplist-entries`和`zset-max-ziplist-value`进行配置。 236 | 237 | [1]:http://zhangtielei.com/posts/blog-redis-quicklist.html 238 | [2]:http://zhangtielei.com/posts/blog-redis-skiplist.html -------------------------------------------------------------------------------- /src/main/java/com/tools/redis/分布式数据库与缓存双写一致性方案.md: -------------------------------------------------------------------------------- 1 | 2 | ### 分布式数据库与缓存双写一致性方案 3 | 4 | #### ~~先更新数据库,在更新缓存~~ 5 | -------------- 6 | 7 | 该方案不推荐使用,主要原因有两点: 8 | 1. 原因一:线程安全问题 9 | 若同时有AB两个线程进行更新,则会出现数据不一致的问题 10 | 1. A更新了数据库 11 | 2. B更新了数据库 12 | 3. B更新了缓存 13 | 4. A更新了缓存 14 | 2. 原因二:业务场景 15 | 1. 若数据库的写操作比较多,读操作较少的话,会导致缓存频繁更新,浪费性能。 16 | 2. 若写操作并不是直接将数据写入缓存,有其他计算操作的话,那需要每次更新数据库都要重新计算。 17 | 18 | #### ~~先删除缓存,再更新数据库~~ 19 | ------------------ 20 | 21 | 该方案在多线程的条件下同样会存在数据不一致的问题 22 | 1. 请求A进行写操作,删除缓存; 23 | 2. 请求B查询发现缓存不存在; 24 | 3. 请求B去数据库查询得到旧值; 25 | 4. 请求B将旧值写入缓存; 26 | 5. 请求A将新值写入数据库。 27 | 该方案如果不设置超时时间的话,那么在下一次写操作前,缓存中都为脏数据。 28 | 29 | 这里可以采用延迟两遍删除的策略来保证。 30 | ```jshelllanguage 31 | public void write(String key,Object data){ 32 | //第一次先删除 33 | redis.delKey(key); 34 | //开始写数据 35 | db.updateData(data); 36 | //等待一秒 37 | Thread.sleep(1000); 38 | //再次删除缓存 39 | redis.delKey(key); 40 | } 41 | //使用该方案需要考虑延迟的时间问题,要结合自己的写操作逻辑 42 | ``` 43 | **如果你用了MySQL的读写分离架构怎么办?** 44 | 45 | 在这种情况下,造成数据不一致的原因如下,还是两个请求,一个请求A进行更新操作,另一个请求B进行查询操作。 46 | 47 | 1. 请求A进行写操作,删除缓存; 48 | 49 | 2. 请求A将数据写入数据库了; 50 | 51 | 3. 请求B查询缓存发现,缓存没有值; 52 | 53 | 4. 请求B去从库查询,这时,还没有完成主从同步,因此查询到的是旧值; 54 | 55 | 5. 请求B将旧值写入缓存; 56 | 57 | 数据库完成主从同步,从库变为新值。 58 | 59 | 上述情形,就是数据不一致的原因。还是使用双删延时策略。只是,睡眠时间修改为在主从同步的延时时间基础上,加几百ms。 60 | 61 | **采用这种同步淘汰策略,吞吐量降低怎么办?** 62 | 63 | 那就将第二次删除作为异步的。自己起一个线程,异步删除。这样,写的请求就不用沉睡一段时间再返回。这么做,加大吞吐量。 64 | 65 | **第二次删除,如果删除失败怎么办?** 66 | 67 | 这是个非常好的问题,因为第二次删除失败,就会出现如下情形。还是有两个请求,一个请求A进行更新操作,另一个请求B进行查询操作,为了方便,假设是单库: 68 | 69 | 1. 请求A进行写操作,删除缓存; 70 | 71 | 2. 请求B查询发现缓存不存在; 72 | 73 | 3. 请求B去数据库查询得到旧值; 74 | 75 | 4. 请求B将旧值写入缓存; 76 | 77 | 5. 
请求A将新值写入数据库; 78 | 79 | 请求A试图去删除请求B写入对缓存值,结果失败了。 80 | 81 | 这也就是说,如果第二次删除缓存失败,会再次出现缓存和数据库不一致的问题。 82 | 83 | #### 先更新数据库,再删除缓存(推荐) 84 | --------------------- 85 | 86 | 在微软的一篇文章「cache aside pattern」中指出一种更新策略 87 | 88 | - 失效场景:先从缓存中获取数据,没有得到,从数据库中获取,成功后,加入缓存。 89 | - 命中场景:命中缓存中的数据,返回 90 | - 更新场景:先把数据保存到数据库中,成功后删除缓存,或让缓存失效 91 | **并发问题** 92 | 缓存刚好失效; 93 | 94 | 请求A查询数据库,得一个旧值; 95 | 96 | 请求B将新值写入数据库; 97 | 98 | 请求B删除缓存; 99 | 100 | 请求A将查到的旧值写入缓存。 101 | 此时确实会产生数据不一致的问题,但一般场景下,数据写入会更慢于查询。 102 | 103 | 解决方案就是->给缓存设定失效时间,或者使用延迟删除的策略 104 | 105 | **删除缓存失败怎么办** 106 | 答:**重试机制** 107 | 108 | - 使用消息队列来维护重试列表,每次失败时,我们可以将失败的key保存到消息队列中,一段时间后在进行重试,缺点是会对原有的业务代码造成侵入。 109 | - 第二种方案是,监控binlog,每当有操作时,将消息发送到另一各系统中,这样就不会对原有的业务代码造成侵入,问题是维护两套系统,不过该方案可以考虑使用已有的binlog工具。 110 | 111 | 参考:[分布式数据库与缓存双写一致性方案解疑][1] 112 | 113 | [1]:https://mp.weixin.qq.com/s/ICABpJJkeaFoOO0qeAa2cA -------------------------------------------------------------------------------- /src/main/java/com/tools/zookeeper/discovery/client/DistributeClient.java: -------------------------------------------------------------------------------- 1 | package com.tools.zookeeper.discovery.client; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import org.apache.zookeeper.ZooKeeper; 5 | 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | import java.util.concurrent.CountDownLatch; 9 | 10 | @Slf4j 11 | public class DistributeClient { 12 | 13 | private static final String connectString = "localhost:2181"; 14 | 15 | private static final Integer sessionTimeout = 2000; 16 | 17 | private static final String parentNode = "/Servers"; 18 | 19 | private ZooKeeper zk = null; 20 | 21 | private volatile List serverList = null; 22 | 23 | private static CountDownLatch countDownLatch = new CountDownLatch(1); 24 | 25 | /** 26 | * 异步或者zookeeper链接,注意要使用CountDownLatch阻塞 27 | * 28 | * @throws Exception 29 | */ 30 | public void getConnect() throws Exception { 31 | zk = new ZooKeeper(connectString, sessionTimeout, event -> { 32 | try { 33 | log.info("链接成功,准备获取信息"); 34 | countDownLatch.countDown(); 35 | getServerList(); 36 | } catch (Exception e) { 37 | log.error("获取信息失败!{}", e.getMessage()); 38 | } 39 | }); 40 | countDownLatch.await(); 41 | } 42 | 43 | 44 | /** 45 | * 获取服务列表 46 | * 47 | * @throws Exception 48 | */ 49 | public void getServerList() throws Exception { 50 | /** 51 | * 读取数据,可以获取到节点列表和节点数据, 52 | */ 53 | // List children = zk.getChildren(parentNode, true); 54 | List children = zk.getChildren(parentNode, event -> { 55 | try { 56 | /** 57 | * 支持自定义Watch,在节点变更时会发送NodeChildrenChanged事件 58 | * 不过Watch仅一次有效 59 | */ 60 | log.debug(event.getType().toString()); 61 | log.debug(event.getState().toString()); 62 | log.info("此刻有节点变更事件产生!"); 63 | getServerList(); 64 | } catch (Exception e) { 65 | log.error("注册的Watch调用失败。"); 66 | } 67 | }); 68 | 69 | List list = new ArrayList<>(); 70 | 71 | for (String child : children) { 72 | /** 73 | * 可以根据路径,获取节点中保存的数据,同样getChildren支持Watch注册 74 | * 在节点数据发生变化时,可以发送事件。NodeDataChanged 75 | */ 76 | log.info("服务节点路径为:{}",child); 77 | byte[] data = zk.getData(parentNode + "/" + child, false, null); 78 | list.add(new String(data)); 79 | } 80 | serverList = list; 81 | handlerService(); 82 | } 83 | 84 | /** 85 | * 打印服务列表 86 | */ 87 | public void handlerService() { 88 | if (serverList.size() < 1) { 89 | log.info("当前无可用节点"); 90 | return; 91 | } 92 | serverList.forEach(server -> log.info("当前在线服务有:{}", server)); 93 | } 94 | 95 | public static void main(String[] args) throws Exception { 96 | DistributeClient client = new DistributeClient(); 97 | client.getConnect(); 
98 | Thread.sleep(Long.MAX_VALUE); 99 | } 100 | 101 | } 102 | -------------------------------------------------------------------------------- /src/main/java/com/tools/zookeeper/discovery/server/DistributeServer.java: -------------------------------------------------------------------------------- 1 | package com.tools.zookeeper.discovery.server; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import org.apache.zookeeper.CreateMode; 5 | import org.apache.zookeeper.ZooDefs.Ids; 6 | import org.apache.zookeeper.ZooKeeper; 7 | 8 | import java.util.concurrent.CountDownLatch; 9 | 10 | @Slf4j 11 | public class DistributeServer { 12 | 13 | //zookeeper连接地址 14 | private static final String connectString = "localhost:2181"; 15 | //超时时间 用于zookeeper判断当前节点等待心跳最长时间 16 | private static final Integer sessionTimeout = 2000; 17 | //根节点 ------ 启动前需要手动创建 18 | private static final String parentNode = "/Servers"; 19 | 20 | private ZooKeeper zk = null; 21 | 22 | private static CountDownLatch countDownLatch = new CountDownLatch(1); 23 | 24 | 25 | /** 26 | * 创建链接 27 | * 28 | * @throws Exception 29 | */ 30 | public void getConnect() throws Exception { 31 | zk = new ZooKeeper(connectString, sessionTimeout, event -> { 32 | log.info("链接状态更改----{}", event.getState()); 33 | countDownLatch.countDown(); 34 | }); 35 | /** 36 | * 由于创建是异步的可能会导致链接未创建就执行 37 | * 所以这里使用CountDownLatch进行阻塞 38 | * 39 | * 在链接创建后使用Watch进行解除阻塞。 40 | */ 41 | countDownLatch.await(); 42 | //根据两个能够确定一个会话,可以实现客户端会话复用 43 | log.info("sessionId为:{}", zk.getSessionId()); 44 | log.info("会话密钥为:{}", zk.getSessionPasswd()); 45 | 46 | 47 | } 48 | 49 | /** 50 | * @param @param hostName 51 | * @param @throws Exception 52 | * @param @throws InterruptedException 53 | * @return void 54 | * @throws 55 | * @Title: regServer 56 | * @Description: 向zookeeper注册服务 57 | */ 58 | public void regServer(String hostName) throws Exception { 59 | //支持异步创建,不支持递归创建,即不存在父节点的情况下不可以创建 60 | String creatPath = zk.create(parentNode + "/server", hostName.getBytes(), Ids.OPEN_ACL_UNSAFE, 61 | CreateMode.EPHEMERAL); 62 | log.info("{}-------- is on line-----{}", hostName, creatPath); 63 | } 64 | 65 | /** 66 | * @param hostName 67 | */ 68 | public void handleService(String hostName) { 69 | log.info("{} start working", hostName); 70 | } 71 | 72 | public static void main(String[] args) throws Exception { 73 | 74 | DistributeServer distributeServer = new DistributeServer(); 75 | 76 | distributeServer.getConnect(); 77 | 78 | distributeServer.regServer(args[0]); 79 | 80 | distributeServer.handleService(args[0]); 81 | 82 | Thread.sleep(Long.MAX_VALUE); 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /src/main/java/com/tools/zookeeper/discovery/服务注册与发现.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sev7e0/bigdata-practice/ffbdd93bd555fd388d4dd20ccc3379124a3eae5f/src/main/java/com/tools/zookeeper/discovery/服务注册与发现.md -------------------------------------------------------------------------------- /src/main/java/com/tools/zookeeper/election/Broker_1.java: -------------------------------------------------------------------------------- 1 | package com.tools.zookeeper.election; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import org.apache.curator.framework.CuratorFramework; 5 | 6 | import java.util.concurrent.CountDownLatch; 7 | 8 | @Slf4j 9 | public class Broker_1 { 10 | private static final CountDownLatch shutdownLatch = new CountDownLatch(1); 11 | private static final String name 
= "Broker_1"; 12 | 13 | public static void main(String[] args) throws InterruptedException { 14 | ZkElectionUtil electionUtil = new ZkElectionUtil(); 15 | try { 16 | electionUtil.electionMaster(name.getBytes()); 17 | } catch (Exception e) { 18 | e.printStackTrace(); 19 | } 20 | shutdownLatch.await(); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/com/tools/zookeeper/election/Broker_2.java: -------------------------------------------------------------------------------- 1 | package com.tools.zookeeper.election; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | 5 | import java.util.concurrent.CountDownLatch; 6 | 7 | @Slf4j 8 | public class Broker_2 { 9 | private static final CountDownLatch shutdownLatch = new CountDownLatch(1); 10 | private static final String name = "Broker_2"; 11 | 12 | public static void main(String[] args) throws InterruptedException { 13 | ZkElectionUtil electionUtil = new ZkElectionUtil(); 14 | try { 15 | electionUtil.electionMaster(name.getBytes()); 16 | } catch (Exception e) { 17 | e.printStackTrace(); 18 | } 19 | shutdownLatch.await(); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/com/tools/zookeeper/election/Broker_3.java: -------------------------------------------------------------------------------- 1 | package com.tools.zookeeper.election; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import org.apache.curator.framework.CuratorFramework; 5 | 6 | import java.util.concurrent.CountDownLatch; 7 | 8 | @Slf4j 9 | public class Broker_3 { 10 | private static final CountDownLatch shutdownLatch = new CountDownLatch(1); 11 | private static final String name = "Broker_3"; 12 | 13 | public static void main(String[] args) throws InterruptedException { 14 | ZkElectionUtil electionUtil = new ZkElectionUtil(); 15 | try { 16 | electionUtil.electionMaster(name.getBytes()); 17 | } catch (Exception e) { 18 | e.printStackTrace(); 19 | } 20 | shutdownLatch.await(); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/com/tools/zookeeper/election/ZkElectionUtil.java: -------------------------------------------------------------------------------- 1 | package com.tools.zookeeper.election; 2 | 3 | import lombok.extern.slf4j.Slf4j; 4 | import org.apache.curator.framework.CuratorFramework; 5 | import org.apache.curator.framework.CuratorFrameworkFactory; 6 | import org.apache.curator.framework.api.CuratorWatcher; 7 | import org.apache.curator.retry.ExponentialBackoffRetry; 8 | import org.apache.zookeeper.CreateMode; 9 | 10 | import java.util.Arrays; 11 | import java.util.Objects; 12 | 13 | 14 | @Slf4j 15 | public class ZkElectionUtil { 16 | private static final String CONNECTSTRING = "localhost:2181"; 17 | private static final int SESSIONTIMEOUT = 2000; 18 | private static final String LOCKNODE = "/rootNode/lock"; 19 | private static final CuratorFramework client; 20 | // 初始化客户端 21 | static { 22 | ExponentialBackoffRetry exponentialBackoffRetry = new ExponentialBackoffRetry(SESSIONTIMEOUT, 3); 23 | client = CuratorFrameworkFactory.newClient(CONNECTSTRING, exponentialBackoffRetry); 24 | client.start(); 25 | } 26 | 27 | /** 28 | * 创建节点 29 | * @param data 30 | * @return 31 | */ 32 | private boolean getLock(byte[] data) { 33 | try { 34 | if (Objects.isNull(client.checkExists().forPath(LOCKNODE))) { 35 | client.create() 36 | .creatingParentContainersIfNeeded() 37 | .withMode(CreateMode.EPHEMERAL) 38 | 
.forPath(LOCKNODE, data); 39 | } else { 40 | return false; 41 | } 42 | } catch (Exception e) { 43 | log.warn("create node path fail, reason: {}", e.getMessage()); 44 | return false; 45 | } 46 | return true; 47 | } 48 | 49 | /** 50 | * 选主 51 | * @param data 52 | * @throws Exception 53 | */ 54 | void electionMaster(byte[] data) throws Exception { 55 | //尝试创建zk临时节点 56 | if (getLock(data)) { 57 | log.info("now you are leader"); 58 | } else { 59 | log.warn("now you are follower, leader was: {}", getLeader()); 60 | client.getData() 61 | // 每次选举失败,重新注册节点监听事件 62 | .usingWatcher((CuratorWatcher) event -> { 63 | log.info("leader node was changed, will start election"); 64 | // 递归调用 65 | electionMaster(data); 66 | }) 67 | .forPath(LOCKNODE); 68 | } 69 | } 70 | 71 | /** 72 | * 获取创建成功的数据 73 | * @return 74 | */ 75 | private String getLeader() { 76 | try { 77 | return Arrays.toString(client.getData().forPath(LOCKNODE)); 78 | } catch (Exception e) { 79 | log.error("get leader error: {}",e.getMessage()); 80 | } 81 | return "no leader"; 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /src/main/java/com/tools/zookeeper/zookeeper选举机制.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sev7e0/bigdata-practice/ffbdd93bd555fd388d4dd20ccc3379124a3eae5f/src/main/java/com/tools/zookeeper/zookeeper选举机制.pdf -------------------------------------------------------------------------------- /src/main/resources/log4j2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | --------------------------------------------------------------------------------
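One small readability note on the leader-election demo above: `ZkElectionUtil.getLeader()` formats the lock-node payload with `Arrays.toString(byte[])`, so the log prints numeric byte values rather than the broker name that `Broker_1/2/3` wrote via `name.getBytes()`. The snippet below is a hedged, standalone sketch (class name and connection settings are placeholders, not part of the repository) that reads the same lock node and decodes the payload as UTF-8 text instead.

```java
package com.tools.zookeeper.election;

import java.nio.charset.StandardCharsets;

import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.CuratorFrameworkFactory;
import org.apache.curator.retry.ExponentialBackoffRetry;

/**
 * Hypothetical helper, not in the repo: prints the current leader's name
 * by decoding the payload stored on the ephemeral lock node.
 */
public class LeaderNamePrinter {
    public static void main(String[] args) throws Exception {
        CuratorFramework client = CuratorFrameworkFactory
                .newClient("localhost:2181", new ExponentialBackoffRetry(2000, 3));
        client.start();
        // Broker_x stores name.getBytes() on /rootNode/lock, so decode it back to text
        // instead of Arrays.toString(byte[]), which prints the raw byte values.
        byte[] data = client.getData().forPath("/rootNode/lock");
        System.out.println("current leader: " + new String(data, StandardCharsets.UTF_8));
        client.close();
    }
}
```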