├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── engine_example ├── Makefile ├── README ├── data_store.cc ├── data_store.h ├── door_plate.cc ├── door_plate.h ├── engine_example.cc ├── engine_example.h ├── util.cc └── util.h ├── engine_java ├── .gitignore ├── jvm_parameters ├── pom.xml └── src │ └── main │ ├── java │ ├── com │ │ └── alibabacloud │ │ │ └── polar_race │ │ │ └── engine │ │ │ └── common │ │ │ ├── AbstractEngine.java │ │ │ ├── AbstractVisitor.java │ │ │ ├── EngineRace.java │ │ │ ├── EngineTest.java │ │ │ ├── RangeTest.java │ │ │ ├── ReadTest.java │ │ │ ├── WriteTest.java │ │ │ └── exceptions │ │ │ ├── EngineException.java │ │ │ └── RetCodeEnum.java │ └── moe │ │ └── cnkirito │ │ ├── directio │ │ ├── DirectChannel.java │ │ ├── DirectChannelImpl.java │ │ ├── DirectIOLib.java │ │ ├── DirectIOUtils.java │ │ ├── DirectRandomAccessFile.java │ │ └── OpenFlags.java │ │ └── kiritodb │ │ ├── KiritoDB.java │ │ ├── common │ │ ├── Constant.java │ │ ├── LoopQuerySemaphore.java │ │ ├── UnsafeUtil.java │ │ └── Util.java │ │ ├── data │ │ ├── CommitLog.java │ │ └── CommitLogAware.java │ │ ├── index │ │ ├── ArrayMemoryIndex.java │ │ ├── CommitLogIndex.java │ │ ├── HppcMemoryIndex.java │ │ └── MemoryIndex.java │ │ ├── partition │ │ ├── FirstBytePartitoner.java │ │ ├── HighTenPartitioner.java │ │ └── Partitionable.java │ │ └── range │ │ ├── CacheItem.java │ │ ├── FetchDataProducer.java │ │ ├── LocalVisitor.java │ │ └── RangeTask.java │ └── resources │ └── log4j.properties ├── engine_race ├── Makefile ├── engine_race.cc └── engine_race.h ├── include ├── engine.h └── polar_string.h └── test ├── README └── test.cc /.gitignore: -------------------------------------------------------------------------------- 1 | # maven ignore 2 | target/ 3 | *.jar 4 | !.mvn/wrapper/* 5 | *.war 6 | *.zip 7 | *.tar 8 | *.tar.gz 9 | 10 | # eclipse ignore 11 | .settings/ 12 | .project 13 | .classpath 14 | 15 | # idea ignore 16 | .idea/ 17 | *.ipr 18 | *.iml 19 | *.iws 20 | 21 | # temp 
ignore 22 | *.log 23 | *.cache 24 | *.diff 25 | *.patch 26 | *.tmp 27 | 28 | # system ignore 29 | .DS_Store 30 | Thumbs.db 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 xujingfeng 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CLEAN_FILES = # deliberately empty, so we can append below. 2 | CXX=g++ 3 | PLATFORM_LDFLAGS= -lpthread -lrt 4 | PLATFORM_CXXFLAGS= -std=c++11 5 | PROFILING_FLAGS=-pg 6 | OPT= 7 | LDFLAGS += -Wl,-rpath=$(RPATH) 8 | 9 | # DEBUG_LEVEL can have two values: 10 | # * DEBUG_LEVEL=2; this is the ultimate debug mode. 
It will compile benchmark 11 | # without any optimizations. To compile with level 2, issue `make dbg` 12 | # * DEBUG_LEVEL=0; this is the debug level we use for release. If you're 13 | # running benchmark in production you most definitely want to compile benchmark 14 | # with debug level 0. To compile with level 0, run `make`, 15 | 16 | # Set the default DEBUG_LEVEL to 0 17 | DEBUG_LEVEL?=0 18 | 19 | ifeq ($(MAKECMDGOALS),dbg) 20 | DEBUG_LEVEL=2 21 | endif 22 | 23 | # compile with -O2 if debug level is not 2 24 | ifneq ($(DEBUG_LEVEL), 2) 25 | OPT += -O2 -fno-omit-frame-pointer 26 | # if we're compiling for release, compile without debug code (-DNDEBUG) and 27 | # don't treat warnings as errors 28 | OPT += -DNDEBUG 29 | DISABLE_WARNING_AS_ERROR=1 30 | # Skip for archs that don't support -momit-leaf-frame-pointer 31 | ifeq (,$(shell $(CXX) -fsyntax-only -momit-leaf-frame-pointer -xc /dev/null 2>&1)) 32 | OPT += -momit-leaf-frame-pointer 33 | endif 34 | else 35 | $(warning Warning: Compiling in debug mode. 
Don't use the resulting binary in production) 36 | OPT += $(PROFILING_FLAGS) 37 | DEBUG_SUFFIX = "_debug" 38 | endif 39 | 40 | # ----------------Dependences------------------- 41 | 42 | INCLUDE_PATH = -I./ 43 | 44 | # ---------------End Dependences---------------- 45 | 46 | LIB_SOURCES := $(wildcard $(SRC_PATH)/*.cc) 47 | 48 | #----------------------------------------------- 49 | 50 | AM_DEFAULT_VERBOSITY = 0 51 | 52 | AM_V_GEN = $(am__v_GEN_$(V)) 53 | am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY)) 54 | am__v_GEN_0 = @echo " GEN " $(notdir $@); 55 | am__v_GEN_1 = 56 | AM_V_at = $(am__v_at_$(V)) 57 | am__v_at_ = $(am__v_at_$(AM_DEFAULT_VERBOSITY)) 58 | am__v_at_0 = @ 59 | am__v_at_1 = 60 | 61 | AM_V_CC = $(am__v_CC_$(V)) 62 | am__v_CC_ = $(am__v_CC_$(AM_DEFAULT_VERBOSITY)) 63 | am__v_CC_0 = @echo " CC " $(notdir $@); 64 | am__v_CC_1 = 65 | CCLD = $(CC) 66 | LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ 67 | AM_V_CCLD = $(am__v_CCLD_$(V)) 68 | am__v_CCLD_ = $(am__v_CCLD_$(AM_DEFAULT_VERBOSITY)) 69 | am__v_CCLD_0 = @echo " CCLD " $(notdir $@); 70 | am__v_CCLD_1 = 71 | 72 | AM_LINK = $(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) 73 | 74 | CXXFLAGS += -g 75 | 76 | # This (the first rule) must depend on "all". 
77 | default: all 78 | 79 | WARNING_FLAGS = -W -Wextra -Wall -Wsign-compare \ 80 | -Wno-unused-parameter -Woverloaded-virtual \ 81 | -Wnon-virtual-dtor -Wno-missing-field-initializers 82 | 83 | ifndef DISABLE_WARNING_AS_ERROR 84 | WARNING_FLAGS += -Werror 85 | endif 86 | 87 | CXXFLAGS += $(WARNING_FLAGS) $(INCLUDE_PATH) $(PLATFORM_CXXFLAGS) $(OPT) 88 | 89 | LDFLAGS += $(PLATFORM_LDFLAGS) 90 | 91 | # ---------------------------------------------- 92 | ifeq ($(TARGET_ENGINE),) 93 | TARGET_ENGINE = engine_race 94 | endif 95 | SUB_PATH = $(CURDIR)/$(TARGET_ENGINE) 96 | 97 | LIBOUTPUT = $(CURDIR)/lib 98 | dummy := $(shell mkdir -p $(LIBOUTPUT)) 99 | LIBRARY = $(LIBOUTPUT)/${LIBNAME}.a 100 | 101 | .PHONY: clean dbg all 102 | 103 | %.o: %.cc 104 | $(AM_V_CC)$(CXX) $(CXXFLAGS) -c $< -o $@ 105 | 106 | all: $(LIBRARY) 107 | 108 | dbg: $(LIBRARY) 109 | 110 | $(LIBRARY): 111 | $(AM_V_at)make -C $(SUB_PATH) DEBUG_LEVEL=$(DEBUG_LEVEL) LIBOUTPUT=$(LIBOUTPUT) EXEC_DIR=$(CURDIR) 112 | 113 | clean: 114 | make -C $(SUB_PATH) LIBOUTPUT=$(LIBOUTPUT) clean 115 | rm -f $(LIBRARY) 116 | rm -rf $(CLEAN_FILES) 117 | rm -rf $(LIBOUTPUT) 118 | find $(SRC_PATH) -maxdepth 1 -name "*.[oda]*" -exec rm -f {} \; 119 | find $(SRC_PATH) -maxdepth 1 -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; 120 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## PolarDB数据库性能大赛Java选手分享 2 | 3 | ### 1 前言 4 | 5 | ![排名](http://kirito.iocoder.cn/image-20181210184521001.png) 6 | 7 | 国际惯例,先报成绩,熬了无数个夜晚,最后依旧被绝杀出了第一页,最终排名第 21 名。前十名的成绩分布为 413.69~416.94,我最终的耗时是 422.43。成绩虽然不是特别亮眼,但与众多参赛选手使用 C++ 作为参赛语言不同,我使用的是 Java,一方面是我 C++ 的能力早已荒废,另一方面是我想验证一下使用 Java 编写存储引擎是否与 C++ 差距巨大(当然,主要还是前者 QAQ)。所以在本文中,我除了介绍整体的架构之外,还会着重笔墨来探讨 Java 编写存储类型应用的一些最佳实践,文末会给出 github 的开源地址。 8 | 9 | ### 2 赛题概览 10 | 11 | 比赛总体分成了初赛和复赛两个阶段,整体要求实现一个简化、高效的 kv 存储引擎 12 | 13 | 初赛要求支持 Write、Read 接口。 14 | 15 | ```java 16 | public 
abstract void write(byte[] key, byte[] value); 17 | public abstract byte[] read(byte[] key); 18 | ``` 19 | 20 | 复赛在初赛题目基础上,还需要额外实现一个 Range 接口。 21 | 22 | ```java 23 | public abstract void range(byte[] lower, byte[] upper, AbstractVisitor visitor); 24 | ``` 25 | 26 | 程序评测逻辑 分为2个阶段: 27 | 1)Recover 正确性评测: 28 | 此阶段评测程序会并发写入特定数据(key 8B、value 4KB)同时进行任意次 kill -9 来模拟进程意外退出(参赛引擎需要保证进程意外退出时数据持久化不丢失),接着重新打开 DB,调用 Read、Range 接口来进行正确性校验 29 | 30 | 2)性能评测 31 | - 随机写入:64 个线程并发随机写入,每个线程使用 Write 各写 100 万次随机数据(key 8B、value 4KB) 32 | - 随机读取:64 个线程并发随机读取,每个线程各使用 Read 读取 100 万次随机数据 33 | - 顺序读取:64 个线程并发顺序读取,每个线程各使用 Range 有序(增序)遍历全量数据 2 次 34 | 注: 35 | 2.2 阶段会对所有读取的 kv 校验是否匹配,如不通过则终止,评测失败; 36 | 2.3 阶段除了对迭代出来每条的 kv校 验是否匹配外,还会额外校验是否严格字典序递增,如不通过则终止,评测失败。 37 | 38 | 语言限定:C++ & JAVA,一起排名 39 | 40 | ### 3 赛题剖析 41 | 42 | 关于文件 IO 操作的一些基本常识,我已经在专题文章中进行了介绍,如果你没有浏览那篇文章,建议先行浏览一下:[文件IO操作的一些最佳实践](https://www.cnkirito.moe/file-io-best-practise/)。再回归赛题,先对赛题中的几个关键词来进行解读。 43 | 44 | #### 3.1 key 8B, value 4kb 45 | 46 | key 为固定的 8 字节,因此可使用 long 来表示。 47 | 48 | value 为 4kb,这节省了我们很大的工作量,因为 4kb 的整数倍落盘是非常磁盘 IO 友好的。 49 | 50 | value 为 4kb 的另一个好处是我们再内存做索引时,可以使用 int 而不是 long,来记录数据的逻辑偏移量:LogicOffset = PhysicalOffset / 4096,可以将 offset 的内存占用量减少一半。 51 | 52 | #### 3.2 kill -9 数据不丢失 53 | 54 | 首先赛题明确表示会进行 kill -9 并验证数据的一致性,这加大了我们在内存中做 write buffer 的难度。但它并没有要求断电不丢失,这间接地阐释了一点:我们可以使用 pageCache 来做写入缓存,在具体代码中我使用了 PageCache 来充当数据和索引的写入缓冲(两者策略不同)。同时这点也限制了参赛选手,不能使用 AIO 这样的异步落盘方式。 55 | 56 | #### 3.3 分阶段测评 57 | 58 | 赛题分为了随机写,随机读,顺序读三个阶段,每个阶段都会重新 open,且不会发生随机写到一半校验随机读这样的行为,所以我们在随机写阶段不需要在内存维护索引,而是直接落盘。随机读和顺序读阶段,磁盘均存在数据,open 阶段需要恢复索引,可以使用多线程并发恢复。 59 | 60 | **同时,赛题还有存在一些隐性的测评细节没有披露给大家,但通过测试,我们可以得知这些信息。** 61 | 62 | #### 3.4 清空 PageCache 的耗时 63 | 64 | 虽然我们可以使用 PageCache,但评测程序在每个阶段之后都使用脚本清空了 PageCache,并且将这部分时间也算进了最终的成绩之中,所以有人感到奇怪:三个阶段的耗时相加比输出出来的成绩要差,其实那几秒便是清空 PageCache 的耗时。 65 | 66 | ```shell 67 | #清理 pagecache (页缓存) 68 | sysctl -w vm.drop_caches=1 69 | #清理 dentries(目录缓存)和 inodes 70 | sysctl -w vm.drop_caches=2 71 | 
#清理pagecache、dentries和inodes 72 | sysctl -w vm.drop_caches=3 73 | ``` 74 | 75 | 这一点启发我们,不能毫无节制的使用 PageCache,也正是因为这一点,一定程度上使得 Direct IO 这一操作成了本次竞赛的银弹。 76 | 77 | #### 3.5 key 的分布 78 | 79 | 这一个隐性条件可谓是本次比赛的关键,因为它涉及到 Range 部分的架构设计。本次比赛的 key 共计 6400w,但是他们的分布都是**均匀**的,在[《文件IO操作的一些最佳实践》](https://www.cnkirito.moe/file-io-best-practise/) 一文中我们已经提到了数据分区的好处,可以大大减少顺序读写的锁冲突,而 key 的分布均匀这一特性,启发我们在做数据分区时,可以按照 key 的高 n 位来做 hash,从而确保 key 两个分区之间整体有序(分区内部无序)。实际我尝试了将数据分成 1024、2048 个分区,效果最佳。 80 | 81 | #### 3.6 Range 的缓存设计 82 | 83 | 赛题要求 64 个线程 Range 两次全量的数据,限时 1h,这也启发了我们,如果不对数据进行缓存,想要在 1h 内完成比赛是不可能的,所以,我们的架构设计应该尽量以 Range 为核心,兼顾随机写和随机读。Range 部分也是最容易拉开差距的一个环节。 84 | 85 | ### 4 架构详解 86 | 87 | 首先需要明确的是,随机写指的是 key 的写入是随机的,但我们可以根据 key hash,将随机写转换为对应分区文件的顺序写。 88 | 89 | ```java 90 | /** 91 | * using high ten bit of the given key to determine which file it hits. 92 | */ 93 | public class HighTenPartitioner implements Partitionable { 94 | @Override 95 | public int getPartition(byte[] key) { 96 | return ((key[0] & 0xff) << 2) | ((key[1] & 0xff) >> 6); 97 | } 98 | } 99 | ``` 100 | 101 | 明确了高位分区的前提再来看整体的架构就变得明朗了 102 | 103 | **全局视角** 104 | 105 | ![全局视角](http://kirito.iocoder.cn/KiritoDB.png) 106 | 107 | **分区视角** 108 | 109 | ![分区视角](http://kirito.iocoder.cn/image-20181210204156199.png) 110 | 111 | **内存视角** 112 | 113 | 内存中仅仅维护有序的 `key[1024][625000]` 数组和 `offset[1024][625000]` 数组。 114 | 115 | 上述两张图对整体的架构进行了一个很好的诠释,利用数据分布均匀的特性,可以将全局数据 hash 成 1024 个分区,在每个分区中存放两类文件:索引文件和数据文件。在随机写入阶段,根据 key 获得该数据对应分区位置,并按照时序,顺序追加到文件末尾,将全局随机写转换为局部顺序写。利用索引和数据一一对应的特性,我们也不需要将 data 的逻辑偏移量落盘,在 recover 阶段可以按照恢复 key 的次序,反推出 value 的逻辑偏移量。 116 | 117 | 在 range 阶段,由于我们事先按照 key 的高 10 位做了分区,所以我们可以认定一个事实,partition(N) 中的任何一个数据一定大于 partition(N-1) 中的任何一个数据,于是我们可以采用大块读,将一个 partition 整体读进内存,供 64 个 visit 线程消费。到这儿便奠定了整体的基调:读盘线程负责按分区读盘进入内存,64 个 visit 线程负责消费内存,按照 key 的次序随机访问内存,进行 Visitor 的回调。 118 | 119 | ### 5 随机写流程 120 | 121 | 介绍完了整体架构,我们分阶段来看一下各个阶段的一些细节优化点,有一些优化在各个环节都会出现,为避免重复,第二次出现的同一优化点我就不赘述了,仅一句带过。 122 | 123 | #### 使用 pageCache 实现写入缓冲区 124 | 125 
| 主要看数据落盘,后讨论索引落盘。磁盘 IO 类型的比赛,第一步便是测量磁盘的 IOPS 以及多少个线程一次读写多大的缓存能够打满 IO,在固定 64 线程写入的前提下,16kb,64kb 均可以达到最理想 IOPS,所以理所当然的想到,可以为每一个分区分配一个写入缓存,凑齐 4 个 value 落盘。但是此次比赛,要做到 kill -9 不丢失数据,不能简单地在内存中分配一个 `ByteBuffer.allocate(4096 * 4);`, 而是可以考虑使用 mmap 内存映射出一片写入缓冲,凑齐 4 个刷盘,这样在 kill -9 之后,PageCache 不会丢失。实测 16kb 落盘比 4kb 落盘要快 6s 左右。 126 | 127 | 索引文件的落盘则没有太大的争议,由于 key 的数据量为固定的 8B,所以 mmap 可以发挥出它写小数据的优势,将 pageCache 利用起来,实测 mmap 相比 filechannel 写索引要快 3s 左右,相信如果把 polardb 这块盘换做其他普通的 ssd,这个数值还要增加。 128 | 129 | #### 写入时不维护内存索引,不写入数据偏移 130 | 131 | 一开始审题不清,在随机写之后误以为会立刻随机读,实际上每个阶段都是独立的,所以不需要在写入时维护内存索引;其次,之前的架构图中也已经提及,不需要写入连带 key+offset 一起写入文件,recover 阶段可以按照恢复索引的顺序,反推出 data 的逻辑偏移,因为我们的 key 和 data 在同一个分区内的位置是一一对应的。 132 | 133 | ### 6 恢复流程 134 | 135 | recover 阶段的逻辑实际上包含在程序的 open 接口之中,我们需要再数据库引擎启动时,将索引从数据文件恢复到内存之中,在这之中也存在一些细节优化点。 136 | 137 | 由于 1024 个分区的存在,我们可以使用 64 个线程 (经验值) 并发地恢复索引,使用快速排序对 `key[1024][625000]` 数组和 `offset[1024][625000]` 进行 sort,之后再 compact,对 key 进行去重。需要注意的一点是,不要使用结构体,将 key 和 offset 封装在一起,这会使得排序和之后的二分效率非常低,这之中涉及到 CPU 缓存行的知识点,不了解的读者可以翻阅我之前的博客: [《CPU Cache 与缓存行》](https://www.cnkirito.moe/cache-line/) 138 | 139 | ```java 140 | // wrong 141 | public class KeyOffset { 142 | long key; 143 | int offset; 144 | } 145 | ``` 146 | 147 | 整个 recover 阶段耗时为 1s,跟 cpp 选手交流后发现恢复流程比之慢了 600ms,这中间让我觉得比较诡异,加载索引和排序不应该这么慢才对,最终也没有优化成功。 148 | 149 | ### 7 随机读流程 150 | 151 | 随机读流程没有太大的优化点,优化空间实在有限,实现思路便是先根据 key 定位到分区,之后在有序的 key 数据中二分查找到 key/offset,拿到 data 的逻辑偏移和分区编号,便可以愉快的随机读了,随机读阶段没有太大的优化点,但仍然比 cpp 选手慢了 2-3s,可能是语言无法越过的差距。 152 | 153 | ### 8 顺序读流程 154 | 155 | Range 环节是整个比赛的大头,也是拉开差距的分水岭。前面我们已经大概提到了 Range 的整体思路是一个生产者消费者模型,n 个生成者负责从磁盘读数据进入内存(n 作为变量,通过 benchmark 来确定多少合适,最终实测 n 为 4 时效果最佳),64 个消费者负责调用 visit 回调,来验证数据,visit 过程就是随机读内存的过程。在 Range 阶段,剩余的内存还有大概 1G 左右,所以我分配了 4 个堆外缓冲,一个 256M,从而可以缓存 4 个分区的数据,并且,我为每一个分区分配了一个读盘线程,负责 load 数据进入缓存,供 64 个消费者消费。 156 | 157 | 具体的顺序读架构可以参见下图: 158 | 159 | ![range](http://kirito.iocoder.cn/image-20181210215200345.png) 160 | 161 | 大体来看,便是 4 个 fetch 线程负责读盘,fetch thread n 负责 `partitionNo % 
4 == n` 编号的分区,完成后通知 visit 消费。这中间充斥着比较多的互斥等待逻辑,并未在图中体现出来,大体如下: 162 | 163 | 1. fetch thread 1~4 加载磁盘数据进入缓存是并发的 164 | 2. visit group 1~64 访问同一个 buffer 是并发的 165 | 3. visit group 1~64 访问不同 partition 对应的 buffer 是按照次序来进行的(达到全局有序) 166 | 4. 加载 partitionN 会阻塞 visit bufferN,visit bufferN 会阻塞加载 partitionN+4(相当于复用4块缓存) 167 | 168 | 大块的加载读进缓存,最大程度复用,是 ReadSeq 部分的关键。顺序读两轮的成绩在 196~198s 左右,相比 C++ 又慢了 4s 左右。 169 | 170 | ### 9 魔鬼在细节中 171 | 172 | 这儿是个分水岭,介绍完了整体架构和四个阶段的细节实现,下面就是介绍下具体的优化点了。 173 | 174 | ### 10 Java 实现 Direct IO 175 | 176 | 由于这次比赛将 drop cache 的时间算进了测评程序之中,所以在不必要的地方应当尽量避免 pageCache,也就是说除了写索引之外,其他阶段不应该出现 pageCache。这对于 Java 选手来说可能是不小的障碍,因为 Java 原生没有提供 Direct IO,需要自己封装一套 JNA 接口,封装这套接口借鉴了开源框架 jaydio 的思路,感谢@尘央的协助,大家可以在文末的代码中看到实现细节。这一点可以说是拦住了一大票 Java 选手。 177 | 178 | Direct IO 需要注意的两个细节: 179 | 180 | 1. 分配的内存需要对齐,对应 jna 方法:posix_memalign 181 | 2. 写入的数据需要对齐,通常是 pageSize 的整数倍,实际使用了 pread 的 O_DIRECT 182 | 183 | ### 11 直接内存优于堆内内存 184 | 185 | 这一点在《文件IO操作的一些最佳实践》中有所提及,堆外内存的两大好处是减少了一份内存拷贝,并且对 gc 友好,在 Direct IO 的实现中,应该配备一套堆外内存的接口,才能发挥出最大的功效。尤其在 Range 阶段,一个缓存区的大小便对应一个 partition 数据分区的大小:256M,大块的内存,更加适合用 DirectByteBuffer 装载。 186 | 187 | ### 12 JVM 调优 188 | 189 | ```shell 190 | -server -Xms2560m -Xmx2560m -XX:MaxDirectMemorySize=1024m -XX:NewRatio=4 -XX:+UseConcMarkSweepGC -XX:+UseParNewGC -XX:-UseBiasedLocking 191 | ``` 192 | 193 | 众所周知 newRatio 控制的是 young 区和 old 区大小的比例,官方推荐参数为 `-XX:NewRatio=1`,很多不注意的 Java 选手可能没有意识去修改它,会在无形中被 gc 拖累。经过和@阿杜的讨论,最终得出的结论: 194 | 195 | 1. young 区过大,对象在年轻代待得太久,多次拷贝 196 | 2. 
old 区过小,会频繁触发 old 区的 cms gc 197 | 198 | 在比赛中这显得尤为重要,`-XX:NewRatio=4` 放大老年代可以有效的减少 cms gc 的次数,将 126 次 cms gc,下降到最终的 5 次。 199 | 200 | ### 13 池化对象 201 | 202 | 无论是 apache 的 ObjectPool 还是 Netty 中的 Recycler,还是 RingBuffer 中预先分配的对象,都在传达一种思想,对于那些反复需要 new 出来的东西,都可以池化,分配内存再回收,这也是一笔不小的开销。在此次比赛的场景下,没必要大费周章地动用对象池,直接一个 ThreadLocal 即可搞定,事实上我对 key/value 的写入和读取都进行了 ThreadLocal 的缓存,做到了永远不再循环中分配对象。 203 | 204 | ### 14 减少线程切换 205 | 206 | 无论是网络 IO 还是磁盘 IO,io worker 线程的时间片都显得尤为的可贵,在我的架构中,range 阶段主要分为了两类线程:64 个 visit 线程并发随机读内存,4 个 io 线程并发读磁盘。木桶效应,我们很容易定位到瓶颈在于 4 个 io 线程,在 wait/notify 的模型中,为了尽可能的减少 io 线程的时间片流失,可以考虑使用 while(true) 进行轮询,而 visit 线程则可以 sleep(1us) 避免 cpu 空转带来的整体性能下降,由于评测机拥有 64 core,所以这样的分配算是较为合理的,为此我实现了一个简单粗暴的信号量。 207 | 208 | ```java 209 | public class LoopQuerySemaphore { 210 | 211 | private volatile boolean permit; 212 | 213 | public LoopQuerySemaphore(boolean permit) { 214 | this.permit = permit; 215 | } 216 | 217 | // for 64 visit thread 218 | public void acquire() throws InterruptedException { 219 | while (!permit) { 220 | Thread.sleep(0,1); 221 | } 222 | permit = false; 223 | } 224 | 225 | // for 4 fetch thread 226 | public void acquireNoSleep() throws InterruptedException { 227 | while (!permit) { 228 | } 229 | permit = false; 230 | } 231 | 232 | public void release() { 233 | permit = true; 234 | } 235 | 236 | } 237 | ``` 238 | 239 | 正确的在 IO 中 acquireNoSleep,在 Visit 中 acquire,可以让成绩相比使用普通的阻塞 Semaphore 提升 6s 左右。 240 | 241 | ### 15 绑核 242 | 243 | 线上机器的抖动在所难免,避免 IO 线程的切换也并不仅仅能够用依靠 while(true) 的轮询,一个 CPU 级别的优化便是腾出 4 个核心专门给 IO 线程使用,完全地避免 IO 线程的时间片争用。在 Java 中这也不难实现,依赖万能的 github,我们可以轻松地实现 Affinity。github 传送门:https://github.com/OpenHFT/Java-Thread-Affinity 244 | 245 | 使用方式: 246 | 247 | ```java 248 | try (final AffinityLock al2 = AffinityLock.acquireLock()) { 249 | // do fetch ... 
250 | } 251 | ``` 252 | 253 | 这个方式可以让你的代码快 1~2 s,并且保持测评的稳定性。 254 | 255 | ### 0 聊聊 FileChannel,MMAP,Direct IO,聊聊比赛 256 | 257 | 我在最终版本的代码中,几乎完全抛弃了 FileChannel,事实上,在不 Drop Cache 的场景下,它已经可以发挥出它利用 PageCache 的一些优势,并且优秀的 Java 存储引擎都主要使用了 FileChannel 来进行读写,在少量的场景下,使用了 MMAP 作为辅助,毕竟,MMAP 在写小数据量文件时存在其价值。 258 | 259 | 另外需要注意的一点,在跟@96年的亚普长谈的一个夜晚,发现 FileChannel 中出人意料的一个实现,在分配对内内存时,它仍然会拷贝一份堆外内存,这对于实际使用 FileChannel 的场景需要额外注意,这部分意料之外分配的内存很容易导致线上的问题(实际上已经遇到了,和 glibc 的 malloc 相关,当 buffer 大于 128k 时,会使用 mmap 分配一块内存作为缓存) 260 | 261 | 说回 FileChannel,MMAP,最容易想到的是 RocketMQ 之中对两者灵活的运用,不知道在其他 Java 实现的存储引擎之中,是不是可以考虑使用 Direct IO 来提升存储引擎的性能呢?我们可以设想一下,利用有限并且少量的 PageCache 来保证一致性,在主流程中使用 Direct IO 配合顺序读写是不是一种可以配套使用的方案,不仅仅 PolarDB,算作是参加本次比赛给予我的一个启发。 262 | 263 | 虽然无缘决赛,但使用 Java 取得这样的成绩还算不是特别难过,在 6400w 数据随机写,随机读,顺序读的场景下,Java 可以做到仅仅相差 C++ 不到 10s 的 overhead,我倒是觉得完全是可以接受的,哈哈。还有一些小的优化点就不在此赘述了,欢迎留言与我交流优化点和比赛感悟。 264 | -------------------------------------------------------------------------------- /engine_example/Makefile: -------------------------------------------------------------------------------- 1 | CLEAN_FILES = # deliberately empty, so we can append below. 2 | CXX=g++ 3 | PLATFORM_LDFLAGS= -lpthread -lrt 4 | PLATFORM_CXXFLAGS= -std=c++11 5 | PROFILING_FLAGS=-pg 6 | OPT= 7 | LDFLAGS += -Wl,-rpath=$(RPATH) 8 | 9 | # DEBUG_LEVEL can have two values: 10 | # * DEBUG_LEVEL=2; this is the ultimate debug mode. It will compile benchmark 11 | # without any optimizations. To compile with level 2, issue `make dbg` 12 | # * DEBUG_LEVEL=0; this is the debug level we use for release. If you're 13 | # running benchmark in production you most definitely want to compile benchmark 14 | # with debug level 0. 
To compile with level 0, run `make`, 15 | 16 | # Set the default DEBUG_LEVEL to 0 17 | DEBUG_LEVEL?=0 18 | 19 | ifeq ($(MAKECMDGOALS),dbg) 20 | DEBUG_LEVEL=2 21 | endif 22 | 23 | # compile with -O2 if debug level is not 2 24 | ifneq ($(DEBUG_LEVEL), 2) 25 | OPT += -O2 -fno-omit-frame-pointer 26 | # if we're compiling for release, compile without debug code (-DNDEBUG) and 27 | # don't treat warnings as errors 28 | OPT += -DNDEBUG 29 | DISABLE_WARNING_AS_ERROR=1 30 | # Skip for archs that don't support -momit-leaf-frame-pointer 31 | ifeq (,$(shell $(CXX) -fsyntax-only -momit-leaf-frame-pointer -xc /dev/null 2>&1)) 32 | OPT += -momit-leaf-frame-pointer 33 | endif 34 | else 35 | $(warning Warning: Compiling in debug mode. Don't use the resulting binary in production) 36 | OPT += $(PROFILING_FLAGS) 37 | DEBUG_SUFFIX = "_debug" 38 | endif 39 | 40 | # ---------------------------------------------- 41 | SRC_PATH = $(CURDIR) 42 | 43 | # ----------------Dependences------------------- 44 | 45 | INCLUDE_PATH = -I./ 46 | 47 | # ---------------End Dependences---------------- 48 | 49 | LIB_SOURCES := $(wildcard $(SRC_PATH)/*.cc) 50 | 51 | #----------------------------------------------- 52 | 53 | AM_DEFAULT_VERBOSITY = 0 54 | 55 | AM_V_GEN = $(am__v_GEN_$(V)) 56 | am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY)) 57 | am__v_GEN_0 = @echo " GEN " $(notdir $@); 58 | am__v_GEN_1 = 59 | AM_V_at = $(am__v_at_$(V)) 60 | am__v_at_ = $(am__v_at_$(AM_DEFAULT_VERBOSITY)) 61 | am__v_at_0 = @ 62 | am__v_at_1 = 63 | 64 | AM_V_CC = $(am__v_CC_$(V)) 65 | am__v_CC_ = $(am__v_CC_$(AM_DEFAULT_VERBOSITY)) 66 | am__v_CC_0 = @echo " CC " $(notdir $@); 67 | am__v_CC_1 = 68 | CCLD = $(CC) 69 | LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ 70 | AM_V_CCLD = $(am__v_CCLD_$(V)) 71 | am__v_CCLD_ = $(am__v_CCLD_$(AM_DEFAULT_VERBOSITY)) 72 | am__v_CCLD_0 = @echo " CCLD " $(notdir $@); 73 | am__v_CCLD_1 = 74 | 75 | AM_LINK = $(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) 76 | 77 
| CXXFLAGS += -g 78 | 79 | # This (the first rule) must depend on "all". 80 | default: all 81 | 82 | WARNING_FLAGS = -W -Wextra -Wall -Wsign-compare \ 83 | -Wno-unused-parameter -Woverloaded-virtual \ 84 | -Wnon-virtual-dtor -Wno-missing-field-initializers 85 | 86 | ifndef DISABLE_WARNING_AS_ERROR 87 | WARNING_FLAGS += -Werror 88 | endif 89 | 90 | CXXFLAGS += $(WARNING_FLAGS) $(INCLUDE_PATH) $(PLATFORM_CXXFLAGS) $(OPT) 91 | 92 | LDFLAGS += $(PLATFORM_LDFLAGS) 93 | 94 | LIBOBJECTS = $(LIB_SOURCES:.cc=.o) 95 | # if user didn't config LIBNAME, set the default 96 | ifeq ($(LIBNAME),) 97 | # we should only run benchmark in production with DEBUG_LEVEL 0 98 | LIBNAME=libengine$(DEBUG_SUFFIX) 99 | endif 100 | 101 | ifeq ($(LIBOUTPUT),) 102 | LIBOUTPUT=$(CURDIR)/lib 103 | endif 104 | 105 | ifeq ($(EXEC_DIR),) 106 | EXEC_DIR=$(CURDIR) 107 | endif 108 | 109 | dummy := $(shell mkdir -p $(LIBOUTPUT)) 110 | LIBRARY = $(LIBOUTPUT)/${LIBNAME}.a 111 | INCLUDE_PATH += -I$(EXEC_DIR) 112 | 113 | .PHONY: clean dbg all 114 | 115 | %.o: %.cc 116 | $(AM_V_CC)$(CXX) $(CXXFLAGS) -c $< -o $@ 117 | 118 | all: $(LIBRARY) 119 | 120 | dbg: $(LIBRARY) 121 | 122 | $(LIBRARY): $(LIBOBJECTS) 123 | $(AM_V_at)rm -f $@ 124 | $(AM_V_at)$(AR) $(ARFLAGS) $@ $(LIBOBJECTS) 125 | 126 | clean: 127 | rm -f $(LIBRARY) 128 | rm -rf $(CLEAN_FILES) 129 | rm -rf $(LIBOUTPUT) 130 | find $(SRC_PATH) -maxdepth 1 -name "*.[oda]*" -exec rm -f {} \; 131 | find $(SRC_PATH) -maxdepth 1 -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; 132 | -------------------------------------------------------------------------------- /engine_example/README: -------------------------------------------------------------------------------- 1 | engine_example is a very simple engine, as a demo to show how to develop a Key-Value engine within the race framework 2 | -------------------------------------------------------------------------------- /engine_example/data_store.cc: 
-------------------------------------------------------------------------------- 1 | // Copyright [2018] Alibaba Cloud All rights reserved 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "util.h" 7 | #include "data_store.h" 8 | 9 | namespace polar_race { 10 | 11 | static const char kDataFilePrefix[] = "DATA_"; 12 | static const int kDataFilePrefixLen = 5; 13 | static const int kSingleFileSize = 1024 * 1024 * 100; 14 | 15 | static std::string FileName(const std::string &dir, uint32_t fileno) { 16 | return dir + "/" + kDataFilePrefix + std::to_string(fileno); 17 | } 18 | 19 | RetCode DataStore::Init() { 20 | if (!FileExists(dir_) 21 | && 0 != mkdir(dir_.c_str(), 0755)) { 22 | return kIOError; 23 | } 24 | 25 | std::vector files; 26 | if (0 != GetDirFiles(dir_, &files)) { 27 | return kIOError; 28 | } 29 | 30 | uint32_t last_no = 0; 31 | uint32_t cur_offset = 0; 32 | 33 | // Get the last data file no 34 | std::string sindex; 35 | std::vector::iterator it; 36 | for (it = files.begin(); it != files.end(); ++it) { 37 | if ((*it).compare(0, kDataFilePrefixLen, kDataFilePrefix) != 0) { 38 | continue; 39 | } 40 | sindex = (*it).substr(kDataFilePrefixLen); 41 | if (std::stoul(sindex) > last_no) { 42 | last_no = std::stoi(sindex); 43 | } 44 | } 45 | 46 | // Get last data file offset 47 | int len = GetFileLength(FileName(dir_, last_no)); 48 | if (len > 0) { 49 | cur_offset = len; 50 | } 51 | 52 | next_location_.file_no = last_no; 53 | next_location_.offset = cur_offset; 54 | 55 | // Open file 56 | return OpenCurFile(); 57 | } 58 | 59 | RetCode DataStore::Append(const std::string& value, Location* location) { 60 | if (value.size() > kSingleFileSize) { 61 | return kInvalidArgument; 62 | } 63 | 64 | if (next_location_.offset + value.size() > kSingleFileSize) { 65 | // Swtich to new file 66 | close(fd_); 67 | next_location_.file_no += 1; 68 | next_location_.offset = 0; 69 | OpenCurFile(); 70 | } 71 | 72 | // Append write 73 | if (0 != FileAppend(fd_, value)) { 74 | 
return kIOError; 75 | } 76 | location->file_no = next_location_.file_no; 77 | location->offset = next_location_.offset; 78 | location->len = value.size(); 79 | 80 | next_location_.offset += location->len; 81 | return kSucc; 82 | } 83 | 84 | RetCode DataStore::Read(const Location& l, std::string* value) { 85 | int fd = open(FileName(dir_, l.file_no).c_str(), O_RDONLY, 0644); 86 | if (fd < 0) { 87 | return kIOError; 88 | } 89 | lseek(fd, l.offset, SEEK_SET); 90 | 91 | char* buf = new char[l.len](); 92 | char* pos = buf; 93 | uint32_t value_len = l.len; 94 | 95 | while (value_len > 0) { 96 | ssize_t r = read(fd, pos, value_len); 97 | if (r < 0) { 98 | if (errno == EINTR) { 99 | continue; // Retry 100 | } 101 | close(fd); 102 | return kIOError; 103 | } 104 | pos += r; 105 | value_len -= r; 106 | } 107 | *value = std::string(buf, l.len); 108 | 109 | delete buf; 110 | close(fd); 111 | return kSucc; 112 | } 113 | 114 | RetCode DataStore::OpenCurFile() { 115 | std::string file_name = FileName(dir_, next_location_.file_no); 116 | int fd = open(file_name.c_str(), O_APPEND | O_WRONLY | O_CREAT, 0644); 117 | if (fd < 0) { 118 | return kIOError; 119 | } 120 | fd_ = fd; 121 | return kSucc; 122 | } 123 | 124 | } // namespace polar_race 125 | -------------------------------------------------------------------------------- /engine_example/data_store.h: -------------------------------------------------------------------------------- 1 | // Copyright [2018] Alibaba Cloud All rights reserved 2 | #ifndef ENGINE_SIMPLE_DATA_STORE_H_ 3 | #define ENGINE_SIMPLE_DATA_STORE_H_ 4 | #include 5 | #include 6 | #include 7 | #include "include/engine.h" 8 | 9 | namespace polar_race { 10 | 11 | struct Location { 12 | Location() : file_no(0), offset(0), len(0) { 13 | } 14 | uint32_t file_no; 15 | uint32_t offset; 16 | uint32_t len; 17 | }; 18 | 19 | class DataStore { 20 | public: 21 | explicit DataStore(const std::string dir) 22 | : fd_(-1), dir_(dir) {} 23 | 24 | ~DataStore() { 25 | if (fd_ > 0) { 
26 | close(fd_); 27 | } 28 | } 29 | 30 | RetCode Init(); 31 | RetCode Read(const Location& l, std::string* value); 32 | RetCode Append(const std::string& value, Location* location); 33 | 34 | private: 35 | int fd_; 36 | std::string dir_; 37 | Location next_location_; 38 | 39 | RetCode OpenCurFile(); 40 | }; 41 | 42 | } // namespace polar_race 43 | #endif // ENGINE_SIMPLE_DATA_STORE_H_ 44 | -------------------------------------------------------------------------------- /engine_example/door_plate.cc: -------------------------------------------------------------------------------- 1 | // Copyright [2018] Alibaba Cloud All rights reserved 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "util.h" 11 | #include "door_plate.h" 12 | 13 | namespace polar_race { 14 | 15 | static const uint32_t kMaxDoorCnt = 1024 * 1024 * 32; 16 | static const char kMetaFileName[] = "META"; 17 | static const int kMaxRangeBufCount = kMaxDoorCnt; 18 | 19 | static bool ItemKeyMatch(const Item &item, const std::string& target) { 20 | if (target.size() != item.key_size 21 | || memcmp(item.key, target.data(), item.key_size) != 0) { 22 | // Conflict 23 | return false; 24 | } 25 | return true; 26 | } 27 | 28 | static bool ItemTryPlace(const Item &item, const std::string& target) { 29 | if (item.in_use == 0) { 30 | return true; 31 | } 32 | return ItemKeyMatch(item, target); 33 | } 34 | 35 | DoorPlate::DoorPlate(const std::string& path) 36 | : dir_(path), 37 | fd_(-1), 38 | items_(NULL) { 39 | } 40 | 41 | RetCode DoorPlate::Init() { 42 | bool new_create = false; 43 | const int map_size = kMaxDoorCnt * sizeof(Item); 44 | 45 | if (!FileExists(dir_) 46 | && 0 != mkdir(dir_.c_str(), 0755)) { 47 | return kIOError; 48 | } 49 | 50 | std::string path = dir_ + "/" + kMetaFileName; 51 | int fd = open(path.c_str(), O_RDWR, 0644); 52 | if (fd < 0 && errno == ENOENT) { 53 | // not exist, then create 54 | fd = open(path.c_str(), O_RDWR | O_CREAT, 
0644); 55 | if (fd >= 0) { 56 | new_create = true; 57 | if (posix_fallocate(fd, 0, map_size) != 0) { 58 | std::cerr << "posix_fallocate failed: " << strerror(errno) << std::endl; 59 | close(fd); 60 | return kIOError; 61 | } 62 | } 63 | } 64 | if (fd < 0) { 65 | return kIOError; 66 | } 67 | fd_ = fd; 68 | 69 | void* ptr = mmap(NULL, map_size, PROT_READ | PROT_WRITE, 70 | MAP_SHARED, fd_, 0); 71 | if (ptr == MAP_FAILED) { 72 | std::cerr << "MAP_FAILED: " << strerror(errno) << std::endl; 73 | close(fd); 74 | return kIOError; 75 | } 76 | if (new_create) { 77 | memset(ptr, 0, map_size); 78 | } 79 | 80 | items_ = reinterpret_cast(ptr); 81 | return kSucc; 82 | } 83 | 84 | DoorPlate::~DoorPlate() { 85 | if (fd_ > 0) { 86 | const int map_size = kMaxDoorCnt * sizeof(Item); 87 | munmap(items_, map_size); 88 | close(fd_); 89 | } 90 | } 91 | 92 | // Very easy hash table, which deal conflict only by try the next one 93 | int DoorPlate::CalcIndex(const std::string& key) { 94 | uint32_t jcnt = 0; 95 | int index = StrHash(key.data(), key.size()) % kMaxDoorCnt; 96 | while (!ItemTryPlace(*(items_ + index), key) 97 | && ++jcnt < kMaxDoorCnt) { 98 | index = (index + 1) % kMaxDoorCnt; 99 | } 100 | 101 | if (jcnt == kMaxDoorCnt) { 102 | // full 103 | return -1; 104 | } 105 | return index; 106 | } 107 | 108 | RetCode DoorPlate::AddOrUpdate(const std::string& key, const Location& l) { 109 | if (key.size() > kMaxKeyLen) { 110 | return kInvalidArgument; 111 | } 112 | 113 | int index = CalcIndex(key); 114 | if (index < 0) { 115 | return kFull; 116 | } 117 | 118 | Item* iptr = items_ + index; 119 | if (iptr->in_use == 0) { 120 | // new item 121 | memcpy(iptr->key, key.data(), key.size()); 122 | iptr->key_size = key.size(); 123 | iptr->in_use = 1; // Place 124 | } 125 | iptr->location = l; 126 | return kSucc; 127 | } 128 | 129 | RetCode DoorPlate::Find(const std::string& key, Location *location) { 130 | int index = CalcIndex(key); 131 | if (index < 0 132 | || !ItemKeyMatch(*(items_ + index), 
key)) { 133 | return kNotFound; 134 | } 135 | 136 | *location = (items_ + index)->location; 137 | return kSucc; 138 | } 139 | 140 | RetCode DoorPlate::GetRangeLocation(const std::string& lower, 141 | const std::string& upper, 142 | std::map *locations) { 143 | int count = 0; 144 | for (Item *it = items_ + kMaxDoorCnt - 1; it >= items_; it--) { 145 | if (!it->in_use) { 146 | continue; 147 | } 148 | std::string key(it->key, it->key_size); 149 | if ((key >= lower || lower.empty()) 150 | && (key < upper || upper.empty())) { 151 | locations->insert(std::pair(key, it->location)); 152 | if (++count > kMaxRangeBufCount) { 153 | return kOutOfMemory; 154 | } 155 | } 156 | } 157 | return kSucc; 158 | } 159 | 160 | } // namespace polar_race 161 | -------------------------------------------------------------------------------- /engine_example/door_plate.h: -------------------------------------------------------------------------------- 1 | // Copyright [2018] Alibaba Cloud All rights reserved 2 | #ifndef ENGINE_EXAMPLE_DOOR_PLATE_H_ 3 | #define ENGINE_EXAMPLE_DOOR_PLATE_H_ 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "include/engine.h" 9 | #include "data_store.h" 10 | 11 | namespace polar_race { 12 | 13 | static const uint32_t kMaxKeyLen = 32; 14 | 15 | struct Item { 16 | Item() : key_size(0), in_use(0) { 17 | } 18 | Location location; 19 | char key[kMaxKeyLen]; 20 | uint32_t key_size; 21 | uint8_t in_use; 22 | }; 23 | 24 | // Hash index for key 25 | class DoorPlate { 26 | public: 27 | explicit DoorPlate(const std::string& path); 28 | ~DoorPlate(); 29 | 30 | RetCode Init(); 31 | 32 | RetCode AddOrUpdate(const std::string& key, const Location& l); 33 | 34 | RetCode Find(const std::string& key, Location *location); 35 | 36 | RetCode GetRangeLocation(const std::string& lower, const std::string& upper, 37 | std::map *locations); 38 | 39 | private: 40 | std::string dir_; 41 | int fd_; 42 | Item *items_; 43 | 44 | int CalcIndex(const std::string& key); 45 | }; 46 
| 47 | } // namespace polar_race 48 | 49 | #endif // ENGINE_EXAMPLE_DOOR_PLATE_H_ 50 | -------------------------------------------------------------------------------- /engine_example/engine_example.cc: -------------------------------------------------------------------------------- 1 | // Copyright [2018] Alibaba Cloud All rights reserved 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "util.h" 7 | #include "engine_example.h" 8 | 9 | namespace polar_race { 10 | 11 | static const char kLockFile[] = "LOCK"; 12 | 13 | RetCode Engine::Open(const std::string& name, Engine** eptr) { 14 | return EngineExample::Open(name, eptr); 15 | } 16 | 17 | Engine::~Engine() { 18 | } 19 | 20 | RetCode EngineExample::Open(const std::string& name, Engine** eptr) { 21 | *eptr = NULL; 22 | EngineExample *engine_example = new EngineExample(name); 23 | 24 | RetCode ret = engine_example->plate_.Init(); 25 | if (ret != kSucc) { 26 | delete engine_example; 27 | return ret; 28 | } 29 | ret = engine_example->store_.Init(); 30 | if (ret != kSucc) { 31 | delete engine_example; 32 | return ret; 33 | } 34 | 35 | if (0 != LockFile(name + "/" + kLockFile, &(engine_example->db_lock_))) { 36 | delete engine_example; 37 | return kIOError; 38 | } 39 | 40 | *eptr = engine_example; 41 | return kSucc; 42 | } 43 | 44 | EngineExample::~EngineExample() { 45 | if (db_lock_) { 46 | UnlockFile(db_lock_); 47 | } 48 | } 49 | 50 | RetCode EngineExample::Write(const PolarString& key, const PolarString& value) { 51 | pthread_mutex_lock(&mu_); 52 | Location location; 53 | RetCode ret = store_.Append(value.ToString(), &location); 54 | if (ret == kSucc) { 55 | ret = plate_.AddOrUpdate(key.ToString(), location); 56 | } 57 | pthread_mutex_unlock(&mu_); 58 | return ret; 59 | } 60 | 61 | RetCode EngineExample::Read(const PolarString& key, std::string* value) { 62 | pthread_mutex_lock(&mu_); 63 | Location location; 64 | RetCode ret = plate_.Find(key.ToString(), &location); 65 | if (ret == kSucc) { 66 | 
value->clear(); 67 | ret = store_.Read(location, value); 68 | } 69 | pthread_mutex_unlock(&mu_); 70 | return ret; 71 | } 72 | 73 | RetCode EngineExample::Range(const PolarString& lower, const PolarString& upper, 74 | Visitor &visitor) { 75 | pthread_mutex_lock(&mu_); 76 | std::map locations; 77 | RetCode ret = plate_.GetRangeLocation(lower.ToString(), upper.ToString(), &locations); 78 | if (ret != kSucc) { 79 | pthread_mutex_unlock(&mu_); 80 | return ret; 81 | } 82 | 83 | std::string value; 84 | for (auto& pair : locations) { 85 | ret = store_.Read(pair.second, &value); 86 | if (kSucc != ret) { 87 | break; 88 | } 89 | visitor.Visit(pair.first, value); 90 | } 91 | pthread_mutex_unlock(&mu_); 92 | return ret; 93 | } 94 | 95 | } // namespace polar_race 96 | 97 | -------------------------------------------------------------------------------- /engine_example/engine_example.h: -------------------------------------------------------------------------------- 1 | // Copyright [2018] Alibaba Cloud All rights reserved 2 | #ifndef ENGINE_EXAMPLE_ENGINE_EXAMPLE_H_ 3 | #define ENGINE_EXAMPLE_ENGINE_EXAMPLE_H_ 4 | #include 5 | #include 6 | #include "include/engine.h" 7 | #include "util.h" 8 | #include "door_plate.h" 9 | #include "data_store.h" 10 | 11 | namespace polar_race { 12 | 13 | class EngineExample : public Engine { 14 | public: 15 | static RetCode Open(const std::string& name, Engine** eptr); 16 | 17 | explicit EngineExample(const std::string& dir) 18 | : mu_(PTHREAD_MUTEX_INITIALIZER), 19 | db_lock_(NULL), plate_(dir), store_(dir) { 20 | } 21 | 22 | ~EngineExample(); 23 | 24 | RetCode Write(const PolarString& key, 25 | const PolarString& value) override; 26 | 27 | RetCode Read(const PolarString& key, 28 | std::string* value) override; 29 | 30 | RetCode Range(const PolarString& lower, 31 | const PolarString& upper, 32 | Visitor &visitor) override; 33 | 34 | private: 35 | pthread_mutex_t mu_; 36 | FileLock* db_lock_; 37 | DoorPlate plate_; 38 | DataStore store_; 39 | }; 
40 | 41 | } // namespace polar_race 42 | 43 | #endif // ENGINE_EXAMPLE_ENGINE_EXAMPLE_H_ 44 | -------------------------------------------------------------------------------- /engine_example/util.cc: -------------------------------------------------------------------------------- 1 | // Copyright [2018] Alibaba Cloud All rights reserved 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "util.h" 10 | 11 | namespace polar_race { 12 | 13 | static const int kA = 54059; // a prime 14 | static const int kB = 76963; // another prime 15 | static const int kFinish = 37; // also prime 16 | uint32_t StrHash(const char* s, int size) { 17 | uint32_t h = kFinish; 18 | while (size > 0) { 19 | h = (h * kA) ^ (s[0] * kB); 20 | s++; 21 | size--; 22 | } 23 | return h; 24 | } 25 | 26 | int GetDirFiles(const std::string& dir, std::vector* result) { 27 | int res = 0; 28 | result->clear(); 29 | DIR* d = opendir(dir.c_str()); 30 | if (d == NULL) { 31 | return errno; 32 | } 33 | struct dirent* entry; 34 | while ((entry = readdir(d)) != NULL) { 35 | if (strcmp(entry->d_name, "..") == 0 || strcmp(entry->d_name, ".") == 0) { 36 | continue; 37 | } 38 | result->push_back(entry->d_name); 39 | } 40 | closedir(d); 41 | return res; 42 | } 43 | 44 | int GetFileLength(const std::string& file) { 45 | struct stat stat_buf; 46 | int rc = stat(file.c_str(), &stat_buf); 47 | return rc == 0 ? 
stat_buf.st_size : -1; 48 | } 49 | 50 | int FileAppend(int fd, const std::string& value) { 51 | if (fd < 0) { 52 | return -1; 53 | } 54 | size_t value_len = value.size(); 55 | const char* pos = value.data(); 56 | while (value_len > 0) { 57 | ssize_t r = write(fd, pos, value_len); 58 | if (r < 0) { 59 | if (errno == EINTR) { 60 | continue; // Retry 61 | } 62 | return -1; 63 | } 64 | pos += r; 65 | value_len -= r; 66 | } 67 | return 0; 68 | } 69 | 70 | bool FileExists(const std::string& path) { 71 | return access(path.c_str(), F_OK) == 0; 72 | } 73 | 74 | static int LockOrUnlock(int fd, bool lock) { 75 | errno = 0; 76 | struct flock f; 77 | memset(&f, 0, sizeof(f)); 78 | f.l_type = (lock ? F_WRLCK : F_UNLCK); 79 | f.l_whence = SEEK_SET; 80 | f.l_start = 0; 81 | f.l_len = 0; // Lock/unlock entire file 82 | return fcntl(fd, F_SETLK, &f); 83 | } 84 | 85 | int LockFile(const std::string& fname, FileLock** lock) { 86 | *lock = NULL; 87 | int result = 0; 88 | int fd = open(fname.c_str(), O_RDWR | O_CREAT, 0644); 89 | if (fd < 0) { 90 | result = errno; 91 | } else if (LockOrUnlock(fd, true) == -1) { 92 | result = errno; 93 | close(fd); 94 | } else { 95 | FileLock* my_lock = new FileLock; 96 | my_lock->fd_ = fd; 97 | my_lock->name_ = fname; 98 | *lock = my_lock; 99 | } 100 | return result; 101 | } 102 | 103 | int UnlockFile(FileLock* lock) { 104 | int result = 0; 105 | if (LockOrUnlock(lock->fd_, false) == -1) { 106 | result = errno; 107 | } 108 | close(lock->fd_); 109 | delete lock; 110 | return result; 111 | } 112 | 113 | } // namespace polar_race 114 | -------------------------------------------------------------------------------- /engine_example/util.h: -------------------------------------------------------------------------------- 1 | // Copyright [2018] Alibaba Cloud All rights reserved 2 | #ifndef ENGINE_SIMPLE_UTIL_H_ 3 | #define ENGINE_SIMPLE_UTIL_H_ 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace polar_race { 10 | 11 | // Hash 12 | uint32_t 
StrHash(const char* s, int size); 13 | 14 | // Env 15 | int GetDirFiles(const std::string& dir, std::vector* result); 16 | int GetFileLength(const std::string& file); 17 | int FileAppend(int fd, const std::string& value); 18 | bool FileExists(const std::string& path); 19 | 20 | // FileLock 21 | class FileLock { 22 | public: 23 | FileLock() {} 24 | virtual ~FileLock() {} 25 | 26 | int fd_; 27 | std::string name_; 28 | 29 | private: 30 | // No copying allowed 31 | FileLock(const FileLock&); 32 | void operator=(const FileLock&); 33 | }; 34 | 35 | int LockFile(const std::string& f, FileLock** l); 36 | int UnlockFile(FileLock* l); 37 | 38 | } // namespace polar_race 39 | 40 | #endif // ENGINE_SIMPLE_UTIL_H_ 41 | -------------------------------------------------------------------------------- /engine_java/.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | .idea -------------------------------------------------------------------------------- /engine_java/jvm_parameters: -------------------------------------------------------------------------------- 1 | -server -Xms2560m -Xmx2560m -XX:MaxDirectMemorySize=1024m -XX:NewRatio=4 -XX:+UseConcMarkSweepGC -XX:+UseParNewGC -XX:-UseBiasedLocking -XX:-RestrictContended -------------------------------------------------------------------------------- /engine_java/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.alibabacloud.polar_race 6 | engine_java 7 | 0.0.1-SNAPSHOT 8 | 9 | 10 | 11 | org.apache.maven.plugins 12 | maven-compiler-plugin 13 | 14 | 1.8 15 | 1.8 16 | 17 | 18 | 19 | maven-assembly-plugin 20 | 2.2.1 21 | 22 | 23 | 24 | com.alibabacloud.polar_race.engine.common.EngineTest 25 | 26 | 27 | EngineTest 28 | false 29 | false 30 | 31 | jar-with-dependencies 32 | 33 | 34 | 35 | 36 | org.apache.maven 37 | maven-core 38 | 2.2.1 39 | 40 | 41 | 42 | 43 | make-assembly 44 | package 45 | 46 | single 47 | 48 | 49 
| 50 | 51 | 52 | 53 | jar 54 | 55 | engine_java 56 | http://maven.apache.org 57 | 58 | 59 | UTF-8 60 | 61 | 62 | 63 | 64 | junit 65 | junit 66 | 4.12 67 | 68 | 69 | log4j 70 | log4j 71 | 1.2.12 72 | 73 | 74 | org.slf4j 75 | slf4j-api 76 | 1.7.25 77 | 78 | 79 | org.slf4j 80 | slf4j-log4j12 81 | 1.7.25 82 | 83 | 84 | com.carrotsearch 85 | hppc 86 | 0.8.1 87 | 88 | 89 | net.smacke 90 | jaydio 91 | 0.1 92 | 93 | 94 | 95 | net.openhft 96 | affinity 97 | 3.1.11 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /engine_java/src/main/java/com/alibabacloud/polar_race/engine/common/AbstractEngine.java: -------------------------------------------------------------------------------- 1 | package com.alibabacloud.polar_race.engine.common; 2 | 3 | import com.alibabacloud.polar_race.engine.common.exceptions.EngineException; 4 | 5 | /** 6 | * Copyright [2018] Alibaba Cloud All rights reserved 7 | *

8 | * Complete the functions below to implement your own engine 9 | */ 10 | 11 | public abstract class AbstractEngine { 12 | /** 13 | * open Engine 14 | * 15 | * @param path the path of engine store data. 16 | * @throws EngineException 17 | */ 18 | public abstract void open(String path) throws EngineException; 19 | 20 | /** 21 | * close Engine 22 | */ 23 | public abstract void close(); 24 | 25 | /** 26 | * mmapWrite a key-value pair into engine 27 | * 28 | * @param key 29 | * @param value 30 | * @throws EngineException 31 | */ 32 | public abstract void write(byte[] key, byte[] value) throws EngineException; 33 | 34 | /** 35 | * read value of a key 36 | * 37 | * @param key 38 | * @return value 39 | * @throws EngineException 40 | */ 41 | public abstract byte[] read(byte[] key) throws EngineException; 42 | 43 | /** 44 | * applies the given AbstractVisitor.Visit() function to the result of every key-value pair in the key range [first, last), 45 | * in order 46 | * 47 | * @param lower start key 48 | * @param upper end key 49 | * @param visitor is check key-value pair,you just call visitor.visit(String key, String value) function in your own engine. 
50 | * @throws EngineException 51 | */ 52 | public abstract void range(byte[] lower, byte[] upper, AbstractVisitor visitor) throws EngineException; 53 | 54 | 55 | } 56 | -------------------------------------------------------------------------------- /engine_java/src/main/java/com/alibabacloud/polar_race/engine/common/AbstractVisitor.java: -------------------------------------------------------------------------------- 1 | package com.alibabacloud.polar_race.engine.common; 2 | 3 | public abstract class AbstractVisitor { 4 | public abstract void visit(byte[] key, byte[] value); 5 | } 6 | -------------------------------------------------------------------------------- /engine_java/src/main/java/com/alibabacloud/polar_race/engine/common/EngineRace.java: -------------------------------------------------------------------------------- 1 | package com.alibabacloud.polar_race.engine.common; 2 | 3 | import com.alibabacloud.polar_race.engine.common.exceptions.EngineException; 4 | import moe.cnkirito.kiritodb.KiritoDB; 5 | 6 | public class EngineRace extends AbstractEngine { 7 | 8 | KiritoDB kiritoDB = new KiritoDB(); 9 | 10 | @Override 11 | public void open(String path) throws EngineException { 12 | kiritoDB.open(path); 13 | } 14 | 15 | @Override 16 | public void write(byte[] key, byte[] value) throws EngineException { 17 | kiritoDB.write(key, value); 18 | } 19 | 20 | @Override 21 | public byte[] read(byte[] key) throws EngineException { 22 | return kiritoDB.read(key); 23 | } 24 | 25 | @Override 26 | public void range(byte[] lower, byte[] upper, AbstractVisitor visitor) throws EngineException { 27 | kiritoDB.range(lower, upper, visitor); 28 | } 29 | 30 | @Override 31 | public void close() { 32 | kiritoDB.close(); 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /engine_java/src/main/java/com/alibabacloud/polar_race/engine/common/EngineTest.java: 
-------------------------------------------------------------------------------- 1 | package com.alibabacloud.polar_race.engine.common; 2 | 3 | import com.alibabacloud.polar_race.engine.common.exceptions.EngineException; 4 | 5 | import java.io.File; 6 | 7 | /** 8 | * @author daofeng.xjf 9 | * @date 2018/12/3 10 | */ 11 | public class EngineTest { 12 | public static void main(String[] args) throws EngineException { 13 | File file = new File("/tmp/kiritoDB"); 14 | deleteDir(file); 15 | new WriteTest().test(); 16 | new ReadTest().test(); 17 | new RangeTest().test(); 18 | } 19 | 20 | private static boolean deleteDir(File dir) { 21 | if (dir.isDirectory()) { 22 | String[] children = dir.list(); 23 | //递归删除目录中的子目录下 24 | for (int i = 0; i < children.length; i++) { 25 | boolean success = deleteDir(new File(dir, children[i])); 26 | if (!success) { 27 | return false; 28 | } 29 | } 30 | } 31 | // 目录此时为空,可以删除 32 | return dir.delete(); 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /engine_java/src/main/java/com/alibabacloud/polar_race/engine/common/RangeTest.java: -------------------------------------------------------------------------------- 1 | package com.alibabacloud.polar_race.engine.common; 2 | 3 | import com.alibabacloud.polar_race.engine.common.exceptions.EngineException; 4 | import moe.cnkirito.kiritodb.range.LocalVisitor; 5 | 6 | public class RangeTest { 7 | 8 | public static void main(String[] args) throws EngineException { 9 | new RangeTest().test(); 10 | } 11 | 12 | public void test() throws EngineException { 13 | long start = System.currentTimeMillis(); 14 | final EngineRace engine = new EngineRace(); 15 | engine.open("/tmp/kiritoDB"); 16 | 17 | 18 | Thread[] threads = new Thread[64]; 19 | for (int i = 0; i < 64; i++) { 20 | threads[i] = new Thread(() -> { 21 | try { 22 | for (int k = 0; k < 2; k++) { 23 | engine.range(null, null, new LocalVisitor()); 24 | } 25 | } catch (EngineException e) { 26 | 
e.printStackTrace(); 27 | } 28 | }, "thread" + i); 29 | } 30 | for (int i = 0; i < 64; i++) { 31 | threads[i].start(); 32 | } 33 | for (int i = 0; i < 64; i++) { 34 | try { 35 | threads[i].join(); 36 | } catch (InterruptedException e) { 37 | e.printStackTrace(); 38 | } 39 | } 40 | 41 | engine.close(); 42 | long end = System.currentTimeMillis(); 43 | System.out.println("range cost " + (end - start) + "ms"); 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /engine_java/src/main/java/com/alibabacloud/polar_race/engine/common/ReadTest.java: -------------------------------------------------------------------------------- 1 | package com.alibabacloud.polar_race.engine.common; 2 | 3 | import com.alibabacloud.polar_race.engine.common.exceptions.EngineException; 4 | import moe.cnkirito.kiritodb.common.Util; 5 | 6 | /** 7 | * 读测试 8 | */ 9 | public class ReadTest { 10 | 11 | public void test() throws EngineException { 12 | // 记录启动时间 13 | long start = System.currentTimeMillis(); 14 | 15 | EngineRace engine = new EngineRace(); 16 | engine.open("/tmp/kiritoDB"); 17 | int len = 64000; 18 | byte[] hashByte = new byte[Byte.MAX_VALUE - Byte.MIN_VALUE + 1]; 19 | byte now = Byte.MIN_VALUE; 20 | for (int i = 0; i < (Byte.MAX_VALUE - Byte.MIN_VALUE + 1); i++) { 21 | hashByte[i] = now++; 22 | } 23 | for (int i = 0; i < len; i++) { 24 | try { 25 | byte[] bytes = Util.long2bytes(i); 26 | bytes[0] = hashByte[i % (Byte.MAX_VALUE - Byte.MIN_VALUE + 1)]; 27 | byte[] bs = engine.read(bytes); 28 | long ans = Util.bytes2Long(bs); 29 | if (i != ans) { 30 | System.err.println("no equal:" + i); 31 | } 32 | } catch (Exception e) { 33 | System.out.println(i); 34 | e.printStackTrace(); 35 | } 36 | } 37 | 38 | engine.close(); 39 | 40 | long end = System.currentTimeMillis(); 41 | System.out.println("readrandom cost" + (end - start) + "ms"); 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- 
/engine_java/src/main/java/com/alibabacloud/polar_race/engine/common/WriteTest.java: -------------------------------------------------------------------------------- 1 | package com.alibabacloud.polar_race.engine.common; 2 | 3 | import com.alibabacloud.polar_race.engine.common.exceptions.EngineException; 4 | import moe.cnkirito.kiritodb.common.Util; 5 | 6 | import java.util.concurrent.CountDownLatch; 7 | import java.util.concurrent.Executor; 8 | import java.util.concurrent.ExecutorService; 9 | import java.util.concurrent.Executors; 10 | import java.util.concurrent.atomic.AtomicInteger; 11 | 12 | /** 13 | * 写测试 14 | */ 15 | public class WriteTest { 16 | public void test() throws EngineException { 17 | 18 | // 记录启动时间 19 | long start = System.currentTimeMillis(); 20 | 21 | // fixed 线程池 22 | Executor executor = Executors.newFixedThreadPool(64); 23 | 24 | // 打开引擎 25 | final EngineRace engine = new EngineRace(); 26 | engine.open("/tmp/kiritoDB"); 27 | write(executor, engine, 0); 28 | ((ExecutorService) executor).shutdownNow(); 29 | 30 | System.out.println(getFreeMemory()); 31 | engine.close(); 32 | long end = System.currentTimeMillis(); 33 | System.out.println("fillrandom cost " + (end - start) + "ms"); 34 | } 35 | 36 | private static void write(Executor executor, EngineRace engine, int offset) { 37 | // 写数据 38 | final AtomicInteger atomicInteger = new AtomicInteger(); 39 | int len = 64000; 40 | final CountDownLatch downLatch = new CountDownLatch(len); 41 | byte[] hashByte = new byte[Byte.MAX_VALUE - Byte.MIN_VALUE + 1]; 42 | byte now = Byte.MIN_VALUE; 43 | for (int i = 0; i < (Byte.MAX_VALUE - Byte.MIN_VALUE + 1); i++) { 44 | hashByte[i] = now++; 45 | } 46 | for (int i = 0; i < len; i++) { 47 | final int cur = i; 48 | executor.execute(new Runnable() { 49 | public void run() { 50 | try { 51 | byte[] bytes = Util.long2bytes(cur); 52 | bytes[0] = hashByte[cur % (Byte.MAX_VALUE - Byte.MIN_VALUE + 1)]; 53 | engine.write(bytes, Util._4kb(cur - offset)); 54 | 
System.out.println(atomicInteger.incrementAndGet()); 55 | downLatch.countDown(); 56 | } catch (EngineException e) { 57 | e.printStackTrace(); 58 | } 59 | } 60 | }); 61 | } 62 | try { 63 | downLatch.await(); 64 | } catch (InterruptedException e) { 65 | e.printStackTrace(); 66 | } 67 | } 68 | 69 | private static String getFreeMemory() { 70 | long free = Runtime.getRuntime().freeMemory() / 1024 / 1024; 71 | long total = Runtime.getRuntime().totalMemory() / 1024 / 1024; 72 | long max = Runtime.getRuntime().maxMemory() / 1024 / 1024; 73 | return "free=" + free + "M,total=" + total + "M,max=" + max; 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /engine_java/src/main/java/com/alibabacloud/polar_race/engine/common/exceptions/EngineException.java: -------------------------------------------------------------------------------- 1 | package com.alibabacloud.polar_race.engine.common.exceptions; 2 | 3 | /** 4 | * When egine has some Exception throw,you can use this EngineException. 5 | * RetCodeEnum define some common return code and error code. 6 | */ 7 | public class EngineException extends Exception { 8 | public RetCodeEnum retCode; 9 | 10 | public EngineException(RetCodeEnum retCode, String msg) { 11 | super(msg); 12 | this.retCode = retCode; 13 | } 14 | 15 | 16 | } 17 | -------------------------------------------------------------------------------- /engine_java/src/main/java/com/alibabacloud/polar_race/engine/common/exceptions/RetCodeEnum.java: -------------------------------------------------------------------------------- 1 | package com.alibabacloud.polar_race.engine.common.exceptions; 2 | 3 | /** 4 | * RetCodeEnum define some return code. 
5 | */ 6 | public enum RetCodeEnum { 7 | SUCC, 8 | NOT_FOUND, 9 | CORRUPTION, 10 | NOT_SUPPORTED, 11 | INVALID_ARGUMENT, 12 | IO_ERROR, 13 | INCOMPLETE, 14 | TIMEDOUT, 15 | FULL, 16 | OUT_OF_MEMORY 17 | } 18 | -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/directio/DirectChannel.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.directio; 2 | 3 | import java.io.IOException; 4 | import java.nio.ByteBuffer; 5 | import java.nio.channels.Channel; 6 | 7 | public interface DirectChannel extends Channel { 8 | /** 9 | * Writes from the src buffer into this channel at position.

10 | * 11 | * @param src 12 | * The {@link ByteBuffer} to write from 13 | * 14 | * @param position 15 | * The position within the file at which to start writing 16 | * 17 | * @return How many bytes were written from src into the file 18 | * @throws IOException 19 | */ 20 | int write(ByteBuffer src, long position) throws IOException; 21 | 22 | /** 23 | * Reads from this channel into the dst buffer from position.

24 | * 25 | * @param dst 26 | * The {@link ByteBuffer} to read into 27 | * 28 | * @param position 29 | * The position within the file at which to start reading 30 | * 31 | * @return How many bytes were placed into dst 32 | * @throws IOException 33 | */ 34 | int read(ByteBuffer dst, long position) throws IOException; 35 | 36 | /** 37 | * @return The file size for this channel 38 | */ 39 | long size(); 40 | 41 | /** 42 | * @return true if this channel is read only, false otherwise 43 | */ 44 | boolean isReadOnly(); 45 | 46 | /** 47 | * Truncates this file's length to fileLength.

48 | * 49 | * @param fileLength The length to which to truncate 50 | * 51 | * @return This UnsafeByteAlignedChannel 52 | * 53 | * @throws IOException 54 | */ 55 | DirectChannel truncate(long fileLength) throws IOException; 56 | 57 | /** 58 | * @return The file descriptor for this channel 59 | */ 60 | int getFD(); 61 | } -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/directio/DirectChannelImpl.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.directio; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.nio.ByteBuffer; 6 | import java.nio.channels.ClosedChannelException; 7 | import java.nio.channels.NonWritableChannelException; 8 | 9 | public class DirectChannelImpl implements DirectChannel { 10 | private DirectIOLib lib; 11 | private int fd; 12 | private boolean isOpen; 13 | private long fileLength; 14 | private boolean isReadOnly; 15 | 16 | public static DirectChannel getChannel(File file, boolean readOnly) throws IOException { 17 | DirectIOLib lib = DirectIOLib.getLibForPath(file.toString()); 18 | return getChannel(lib, file, readOnly); 19 | } 20 | 21 | public static DirectChannel getChannel(DirectIOLib lib, File file, boolean readOnly) throws IOException { 22 | int fd = lib.oDirectOpen(file.toString(), readOnly); 23 | long length = file.length(); 24 | return new DirectChannelImpl(lib, fd, length, readOnly); 25 | } 26 | 27 | private DirectChannelImpl(DirectIOLib lib, int fd, long fileLength, boolean readOnly) { 28 | this.lib = lib; 29 | this.fd = fd; 30 | this.isOpen = true; 31 | this.isReadOnly = readOnly; 32 | this.fileLength = fileLength; 33 | } 34 | 35 | private void ensureOpen() throws ClosedChannelException { 36 | if (!isOpen()) { 37 | throw new ClosedChannelException(); 38 | } 39 | } 40 | 41 | private void ensureWritable() { 42 | if (isReadOnly()) { 43 | throw new NonWritableChannelException(); 44 
| } 45 | } 46 | 47 | @Override 48 | public int read(ByteBuffer dst, long position) throws IOException { 49 | ensureOpen(); 50 | return lib.pread(fd, dst, position); 51 | } 52 | 53 | @Override 54 | public int write(ByteBuffer src, long position) throws IOException { 55 | ensureOpen(); 56 | ensureWritable(); 57 | assert src.position() == lib.blockStart(src.position()); 58 | 59 | int written = lib.pwrite(fd, src, position); 60 | 61 | // update file length if we wrote past it 62 | fileLength = Math.max(position + written, fileLength); 63 | return written; 64 | } 65 | 66 | @Override 67 | public DirectChannel truncate(final long length) throws IOException { 68 | ensureOpen(); 69 | ensureWritable(); 70 | if (DirectIOLib.ftruncate(fd, length) < 0) { 71 | throw new IOException("Error during truncate on descriptor " + fd + ": " + 72 | DirectIOLib.getLastError()); 73 | } 74 | fileLength = length; 75 | return this; 76 | } 77 | 78 | @Override 79 | public long size() { 80 | return fileLength; 81 | } 82 | 83 | @Override 84 | public int getFD() { 85 | return fd; 86 | } 87 | 88 | 89 | @Override 90 | public boolean isOpen() { 91 | return isOpen; 92 | } 93 | 94 | @Override 95 | public boolean isReadOnly() { 96 | return isReadOnly; 97 | } 98 | 99 | @Override 100 | public void close() throws IOException { 101 | if (!isOpen()) { 102 | return; 103 | } 104 | try { 105 | if (!isReadOnly()) { 106 | truncate(fileLength); 107 | } 108 | } finally { 109 | isOpen = false; 110 | if (lib.close(fd) < 0) { 111 | throw new IOException("Error closing file with descriptor " + fd + ": " + 112 | DirectIOLib.getLastError()); 113 | } 114 | } 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/directio/DirectIOLib.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.directio; 2 | 3 | import com.sun.jna.Native; 4 | import com.sun.jna.NativeLong; 5 | import 
com.sun.jna.Platform; 6 | import com.sun.jna.Pointer; 7 | import com.sun.jna.ptr.PointerByReference; 8 | import org.apache.log4j.Logger; 9 | import sun.nio.ch.DirectBuffer; 10 | 11 | import java.io.IOException; 12 | import java.nio.ByteBuffer; 13 | import java.util.ArrayList; 14 | import java.util.List; 15 | 16 | /** 17 | * Class containing native hooks and utility methods for performing direct I/O, using 18 | * the Linux O_DIRECT flag.

19 | * 20 | *

This class is initialized at class load time, by registering JNA hooks into native methods. 21 | * It also calculates the Linux kernel version-dependent alignment amount (in bytes) for use with the O_DIRECT flag, 22 | * when given the path of a file or directory.

23 | */ 24 | public class DirectIOLib { 25 | private static final Logger logger = Logger.getLogger(DirectIOLib.class); 26 | public static boolean binit; 27 | // Class-load-time probe: sets binit when running on Linux with a kernel >= 2.4.10 and then registers the libc bindings through JNA. 28 | static { 29 | binit = false; 30 | try { 31 | if (!Platform.isLinux()) { 32 | logger.warn("Not running Linux, jaydio support disabled"); 33 | } else { // now check to see if we have O_DIRECT... 34 | // positions of the kernel version components inside versionNumbers 35 | final int linuxVersion = 0; 36 | final int majorRev = 1; 37 | final int minorRev = 2; 38 | 39 | List<Integer> versionNumbers = new ArrayList<Integer>(); 40 | for (String v : System.getProperty("os.version").split("\\.|-")) { 41 | if (versionNumbers.size() <= minorRev && v.matches("\\d+")) { // keep the first three numeric components; "\\d+" also accepts multi-digit parts such as the "10" of 2.4.10 42 | versionNumbers.add(Integer.parseInt(v)); 43 | } 44 | } 45 | 46 | /* From "man 2 open": 47 | * 48 | * O_DIRECT support was added under Linux in kernel version 2.4.10. Older Linux kernels simply ignore this flag. Some file systems may not implement 49 | * the flag and open() will fail with EINVAL if it is used. 50 | */ 51 | 52 | // test to see whether kernel version >= 2.4.10 53 | if (versionNumbers.get(linuxVersion) > 2) { 54 | binit = true; 55 | } else if (versionNumbers.get(linuxVersion) == 2) { 56 | if (versionNumbers.get(majorRev) > 4) { 57 | binit = true; 58 | } else if (versionNumbers.get(majorRev) == 4 && versionNumbers.get(minorRev) >= 10) { 59 | binit = true; 60 | } 61 | } 62 | 63 | if (binit) { 64 | // get access to open(), pread(), etc 65 | Native.register(Platform.C_LIBRARY_NAME); 66 | } else { 67 | logger.warn(String.format("O_DIRECT not supported on your version of Linux: %s", System.getProperty("os.version"))); // report the real kernel version rather than the index constants 68 | } 69 | } 70 | } catch (Throwable e) { // also reached when os.version has fewer numeric components than the checks above index into 71 | logger.warn("Unable to register libc at class load time: " + e.getMessage(), e); 72 | } 73 | } 74 | 75 | private int fsBlockSize; 76 | private long fsBlockNotMask; 77 | /** Stores the block size (in bytes) and derives fsBlockNotMask as ~(fsBlockSize - 1). */ 78 | public DirectIOLib(int fsBlockSize) { 79 | this.fsBlockSize = fsBlockSize; 80 | this.fsBlockNotMask = ~((long)fsBlockSize - 1); 81 | } 82 | 83 | 84 | /** 85 | * Static method to register JNA hooks for doing direct I/O

86 | * 87 | * @param workingDir 88 | * A directory within the mounted file system on which we'll be working 89 | * Should preferably BE the directory in which we'll be working. 90 | */ 91 | public static DirectIOLib getLibForPath(String workingDir) { 92 | int fsBlockSize = initilizeSoftBlockSize(workingDir); 93 | if (fsBlockSize == -1) { 94 | logger.warn("O_DIRECT support non available on your version of Linux (" + System.getProperty("os.version") + "), " + 95 | "please upgrade your kernel in order to use jaydio."); 96 | return null; 97 | } 98 | return new DirectIOLib(fsBlockSize); 99 | } 100 | 101 | /** 102 | * Finds a block size for use with O_DIRECT. Choose it in the most paranoid 103 | * way possible to maximize probability that things work. 104 | * 105 | * @param fileOrDir 106 | * A file or directory within which O_DIRECT access will be performed. 107 | */ 108 | private static int initilizeSoftBlockSize(String fileOrDir) { 109 | 110 | int fsBlockSize = -1; 111 | 112 | if (binit) { 113 | // get file system block size for use with workingDir 114 | // see "man 3 posix_memalign" for why we do this 115 | final int _PC_REC_XFER_ALIGN = 0x11; 116 | 117 | fsBlockSize = pathconf(fileOrDir, _PC_REC_XFER_ALIGN); 118 | /* conservative for version >= 2.6 119 | * "man 2 open": 120 | * 121 | * Under Linux 2.6, alignment 122 | * to 512-byte boundaries suffices. 123 | */ 124 | 125 | // Since O_DIRECT requires pages to be memory aligned with the file system block size, 126 | // we will do this too in case the page size and the block size are different for 127 | // whatever reason. 
By taking the least common multiple, everything should be happy: 128 | int pageSize = getpagesize(); 129 | fsBlockSize = lcm(fsBlockSize, pageSize); 130 | 131 | // just being completely paranoid: 132 | // (512 is the rule for 2.6+ kernels as mentioned before) 133 | fsBlockSize = lcm(fsBlockSize, 512); 134 | 135 | // lastly, a sanity check 136 | if (fsBlockSize <= 0 || ((fsBlockSize & (fsBlockSize-1)) != 0)) { 137 | logger.warn("file system block size should be a power of two, was found to be " + fsBlockSize); 138 | logger.warn("Disabling O_DIRECT support"); 139 | return -1; 140 | } 141 | } 142 | 143 | return fsBlockSize; 144 | } 145 | 146 | 147 | // -- Java interfaces to native methods 148 | 149 | /** 150 | * Interface into native pread function. Always reads an entire buffer, 151 | * unlike {@link #pwrite(int, ByteBuffer, long) pwrite()} which uses buffer state 152 | * to determine how much of buffer to write.

153 | * 154 | * @param fd 155 | * A file discriptor to pass to native pread 156 | * 157 | * @param buf 158 | * The direct buffer into which to record the file read 159 | * 160 | * @param offset 161 | * The file offset at which to read 162 | * 163 | * @return The number of bytes successfully read from the file 164 | * 165 | * @throws IOException 166 | */ 167 | public int pread(int fd, ByteBuffer buf, long offset) throws IOException { 168 | buf.clear(); // so that we read an entire buffer 169 | final long address = ((DirectBuffer) buf).address(); 170 | Pointer pointer = new Pointer(address); 171 | int n = pread(fd, pointer, new NativeLong(buf.capacity()), new NativeLong(offset)).intValue(); 172 | if (n < 0) { 173 | throw new IOException("error reading file at offset " + offset + ": " + getLastError()); 174 | } 175 | return n; 176 | } 177 | 178 | /** 179 | * Interface into native pwrite function. Writes bytes corresponding to the nearest file 180 | * system block boundaries between buf.position() and buf.limit().

181 | * 182 | * @param fd 183 | * A file descriptor to pass to native pwrite 184 | * 185 | * @param buf 186 | * The direct buffer from which to write 187 | * 188 | * @param offset 189 | * The file offset at which to write 190 | * 191 | * @return The number of bytes successfully written to the file 192 | * 193 | * @throws IOException 194 | */ 195 | public int pwrite(int fd, ByteBuffer buf, long offset) throws IOException { 196 | 197 | // must always write to end of current block 198 | // To handle writes past the logical file size, 199 | // we will later truncate. 200 | final int start = buf.position(); 201 | assert start == blockStart(start); 202 | final int toWrite = blockEnd(buf.limit()) - start; 203 | 204 | final long address = ((DirectBuffer) buf).address(); 205 | Pointer pointer = new Pointer(address); 206 | 207 | int n = pwrite(fd, pointer.share(start), new NativeLong(toWrite), new NativeLong(offset)).intValue(); 208 | if (n < 0) { 209 | throw new IOException("error writing file at offset " + offset + ": " + getLastError()); 210 | } 211 | return n; 212 | } 213 | 214 | /** 215 | * Use the open Linux system call and pass in the O_DIRECT flag. 216 | * Currently the only other flags passed in are O_RDONLY if readOnly 217 | * is true, and (if not) O_RDWR and O_CREAT. 218 | * 219 | * @param pathname 220 | * The path to the file to open. If file does not exist and we are opening 221 | * with readOnly, this will throw an error. Otherwise, if it does 222 | * not exist but we have readOnly set to false, create the file. 
223 | * 224 | * @param readOnly 225 | * Whether to pass in O_RDONLY 226 | * 227 | * @return An integer file descriptor for the opened file 228 | * 229 | * @throws IOException 230 | */ 231 | public int oDirectOpen(String pathname, boolean readOnly) throws IOException { 232 | int flags = OpenFlags.O_DIRECT; 233 | if (readOnly) { 234 | flags |= OpenFlags.O_RDONLY; 235 | } else { 236 | flags |= OpenFlags.O_RDWR | OpenFlags.O_CREAT; 237 | } 238 | int fd = open(pathname, flags, 00644); 239 | if (fd < 0) { 240 | throw new IOException("Error opening " + pathname + ", got " + getLastError()); 241 | } 242 | return fd; 243 | } 244 | 245 | /** 246 | * Hooks into errno using Native.getLastError(), and parses it with native strerror function. 247 | * 248 | * @return An error message corresponding to the last errno 249 | */ 250 | public static String getLastError() { 251 | return strerror(Native.getLastError()); 252 | } 253 | 254 | 255 | // -- alignment logic utility methods 256 | 257 | /** 258 | * @return The soft block size for use with transfer multiples 259 | * and memory alignment multiples 260 | */ 261 | public int blockSize() { 262 | return fsBlockSize; 263 | } 264 | 265 | /** 266 | * Returns the default buffer size for file channels doing O_DIRECT 267 | * I/O. By default this is equal to the block size. 268 | * 269 | * @return The default buffer size 270 | */ 271 | public int defaultBufferSize() { 272 | return fsBlockSize; 273 | } 274 | 275 | /** 276 | * Given value, find the largest number less than or equal 277 | * to value which is a multiple of the fs block size. 
278 | * 279 | * @param value 280 | * @return The largest number less than or equal to value 281 | * which is a multiple of the soft block size 282 | */ 283 | public long blockStart(long value) { 284 | return value & fsBlockNotMask; 285 | } 286 | 287 | 288 | /** 289 | * @see #blockStart(long) 290 | */ 291 | public int blockStart(int value) { 292 | return (int) (value & fsBlockNotMask); 293 | } 294 | 295 | 296 | /** 297 | * Given value, find the smallest number greater than or equal 298 | * to value which is a multiple of the fs block size. 299 | * 300 | * @param value 301 | * @return The smallest number greater than or equal to value 302 | * which is a multiple of the soft block size 303 | */ 304 | public long blockEnd(long value) { 305 | return (value + fsBlockSize- 1) & fsBlockNotMask; 306 | } 307 | 308 | 309 | 310 | /** 311 | * @see #blockEnd(long) 312 | */ 313 | public int blockEnd(int value) { 314 | return (int) ((value + fsBlockSize - 1) & fsBlockNotMask); 315 | } 316 | 317 | 318 | /** 319 | * Static variant of {@link #blockEnd(int)}. 
320 | * @param blockSize 321 | * @param position 322 | * @return The smallest number greater than or equal to position 323 | * which is a multiple of the blockSize 324 | */ 325 | public static long blockEnd(int blockSize, long position) { 326 | long ceil = (position + blockSize - 1)/blockSize; 327 | return ceil*blockSize; 328 | } 329 | 330 | 331 | /** 332 | * Euclid's algo for gcd is more general than we need 333 | * since we only have powers of 2, but w/e 334 | * @param x 335 | * @param y 336 | * @return The least common multiple of x and y 337 | */ 338 | public static int lcm(long x, long y) { 339 | // will hold gcd 340 | long g = x; 341 | long yc = y; 342 | 343 | // get the gcd first 344 | while (yc != 0) { 345 | long t = g; 346 | g = yc; 347 | yc = t % yc; 348 | } 349 | 350 | return (int)(x*y/g); 351 | } 352 | 353 | 354 | /** 355 | * Given a pointer-to-pointer memptr, sets the dereferenced value to point to the start 356 | * of an allocated block of size bytes, where the starting address is a multiple of 357 | * alignment. It is guaranteed that the block may be freed by calling @{link {@link #free(Pointer)} 358 | * on the starting address. See "man 3 posix_memalign". 359 | * 360 | * @param memptr The pointer-to-pointer which will point to the address of the allocated aligned block 361 | * 362 | * @param alignment The alignment multiple of the starting address of the allocated block 363 | * 364 | * @param size The number of bytes to allocate 365 | * 366 | * @return 0 on success, one of the C error codes on failure. 367 | */ 368 | public static native int posix_memalign(PointerByReference memptr, NativeLong alignment, NativeLong size); 369 | 370 | 371 | /** 372 | * See "man 3 free". 
     *
     * @param ptr The pointer to the hunk of memory which needs freeing
     */
    public static native void free(Pointer ptr);


    /**
     * See "man 2 close"
     *
     * @param fd The file descriptor of the file to close
     *
     * @return 0 on success, -1 on error
     */
    public native int close(int fd); // mustn't forget to do this

    // -- more native function hooks --
    // NOTE(review): these presumably bind to the C library functions of the same
    // names (see the corresponding man pages) - confirm against the native registration.

    /** See "man 2 ftruncate". */
    public static native int ftruncate(int fd, long length);

    // raw natives behind the public pwrite/pread wrappers of this class
    private static native NativeLong pwrite(int fd, Pointer buf, NativeLong count, NativeLong offset);
    private static native NativeLong pread(int fd, Pointer buf, NativeLong count, NativeLong offset);
    // two overloads of open: without and with an explicit mode argument
    private static native int open(String pathname, int flags);
    private static native int open(String pathname, int flags, int mode);
    private static native int getpagesize();
    private static native int pathconf(String path, int name);
    private static native String strerror(int errnum);

}


--------------------------------------------------------------------------------
/engine_java/src/main/java/moe/cnkirito/directio/DirectIOUtils.java:
--------------------------------------------------------------------------------
package moe.cnkirito.directio;

import com.sun.jna.NativeLong;
import com.sun.jna.Pointer;
import com.sun.jna.ptr.PointerByReference;

import java.lang.reflect.Method;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

public class DirectIOUtils {
    /** Byte order of the platform, as reported by {@link ByteOrder#nativeOrder()}. */
    public static final ByteOrder NATIVE_BYTE_ORDER = ByteOrder.nativeOrder();

    /**
     * Allocate capacity bytes of native memory for use as a buffer, and
     * return a {@link ByteBuffer} which gives an interface to this memory.
The 17 | * memory is allocated with 18 | * {@link DirectIOLib#posix_memalign(PointerByReference, NativeLong, NativeLong) DirectIOLib#posix_memalign()} 19 | * to ensure that the buffer can be used with O_DIRECT. 20 | ** 21 | * @param capacity The requested number of bytes to allocate 22 | * 23 | * @return A new JnaMemAlignedBuffer of capacity bytes aligned in native memory. 24 | */ 25 | public static ByteBuffer allocateForDirectIO(DirectIOLib lib, int capacity) { 26 | if (capacity % lib.blockSize() > 0) { 27 | throw new IllegalArgumentException("Capacity (" + capacity + ") must be a multiple" 28 | + "of the block size (" + lib.blockSize() + ")"); 29 | } 30 | NativeLong blockSize = new NativeLong(lib.blockSize()); 31 | PointerByReference pointerToPointer = new PointerByReference(); 32 | 33 | // align memory for use with O_DIRECT 34 | DirectIOLib.posix_memalign(pointerToPointer, blockSize, new NativeLong(capacity)); 35 | return wrapPointer(Pointer.nativeValue(pointerToPointer.getValue()), capacity); 36 | } 37 | 38 | /** 39 | * @param ptr Pointer to wrap. 40 | * @param len Memory location length. 41 | * @return Byte buffer wrapping the given memory. 42 | */ 43 | public static ByteBuffer wrapPointer(long ptr, int len) { 44 | try { 45 | ByteBuffer buf = (ByteBuffer)NEW_DIRECT_BUF_MTD.invoke(JAVA_NIO_ACCESS_OBJ, ptr, len, null); 46 | 47 | assert buf.isDirect(); 48 | return buf; 49 | } 50 | catch (ReflectiveOperationException e) { 51 | throw new RuntimeException("JavaNioAccess#newDirectByteBuffer() method is unavailable.", e); 52 | } 53 | } 54 | 55 | /** JavaNioAccess object. */ 56 | private static final Object JAVA_NIO_ACCESS_OBJ = javaNioAccessObject(); 57 | 58 | /** JavaNioAccess#newDirectByteBuffer method. */ 59 | private static final Method NEW_DIRECT_BUF_MTD = newDirectBufferMethod(); 60 | 61 | /** 62 | * Returns reference to {@code JavaNioAccess.newDirectByteBuffer} method 63 | * from private API for corresponding Java version. 
64 | * 65 | * @return Reference to {@code JavaNioAccess.newDirectByteBuffer} method 66 | * @throws RuntimeException If getting access to the private API is failed. 67 | */ 68 | private static Method newDirectBufferMethod() { 69 | 70 | try { 71 | Class cls = JAVA_NIO_ACCESS_OBJ.getClass(); 72 | 73 | Method mtd = cls.getMethod("newDirectByteBuffer", long.class, int.class, Object.class); 74 | 75 | mtd.setAccessible(true); 76 | 77 | return mtd; 78 | } 79 | catch (ReflectiveOperationException e) { 80 | throw new RuntimeException(miscPackage() + ".JavaNioAccess#newDirectByteBuffer() method is unavailable.", e); 81 | } 82 | } 83 | 84 | /** 85 | * Returns {@code JavaNioAccess} instance from private API for corresponding Java version. 86 | * 87 | * @return {@code JavaNioAccess} instance for corresponding Java version. 88 | * @throws RuntimeException If getting access to the private API is failed. 89 | */ 90 | private static Object javaNioAccessObject() { 91 | String pkgName = miscPackage(); 92 | 93 | try { 94 | Class cls = Class.forName(pkgName + ".misc.SharedSecrets"); 95 | 96 | Method mth = cls.getMethod("getJavaNioAccess"); 97 | 98 | return mth.invoke(null); 99 | } 100 | catch (ReflectiveOperationException e) { 101 | throw new RuntimeException(pkgName + ".misc.JavaNioAccess class is unavailable.", e); 102 | } 103 | } 104 | 105 | private static String miscPackage() { 106 | // Need return 'jdk.interna' if current Java version >= 9 107 | return "sun"; 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/directio/DirectRandomAccessFile.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.directio; 2 | 3 | import java.io.*; 4 | import java.nio.ByteBuffer; 5 | 6 | /** 7 | * Class to emulate the behavior of {@link RandomAccessFile}, but using direct I/O. 
8 | * 9 | */ 10 | public class DirectRandomAccessFile implements Closeable { 11 | 12 | private DirectChannel channel; 13 | 14 | 15 | /** 16 | * @param file The file to open 17 | * 18 | * @param mode Either "rw" or "r", depending on whether this file is read only 19 | * 20 | * @throws IOException 21 | */ 22 | public DirectRandomAccessFile(File file, String mode) 23 | throws IOException { 24 | 25 | boolean readOnly = false; 26 | if (mode.equals("r")) { 27 | readOnly = true; 28 | } else if (!mode.equals("rw")) { 29 | throw new IllegalArgumentException("only r and rw modes supported"); 30 | } 31 | 32 | if (readOnly && !file.isFile()) { 33 | throw new FileNotFoundException("couldn't find file " + file); 34 | } 35 | 36 | this.channel = DirectChannelImpl.getChannel(file, readOnly); 37 | } 38 | 39 | @Override 40 | public void close() throws IOException { 41 | channel.close(); 42 | } 43 | 44 | 45 | public int write(ByteBuffer src, long position) throws IOException { 46 | return channel.write(src, position); 47 | } 48 | 49 | public int read(ByteBuffer dst, long position) throws IOException { 50 | return channel.read(dst, position); 51 | } 52 | 53 | /** 54 | * @return The current position in the file 55 | */ 56 | public long getFilePointer() { 57 | return channel.getFD(); 58 | } 59 | 60 | /** 61 | * @return The current length of the file 62 | */ 63 | public long length() { 64 | return channel.size(); 65 | } 66 | 67 | } -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/directio/OpenFlags.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.directio; 2 | 3 | /** 4 | * Constants for {@link DirectIOLib#oDirectOpen(String, boolean)}.

5 | */ 6 | public final class OpenFlags { 7 | public static final int O_RDONLY = 00; 8 | public static final int O_WRONLY = 01; 9 | public static final int O_RDWR = 02; 10 | public static final int O_CREAT = 0100; 11 | public static final int O_TRUNC = 01000; 12 | public static final int O_DIRECT = 040000; 13 | public static final int O_SYNC = 04000000; 14 | 15 | private OpenFlags() {} 16 | } 17 | -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/kiritodb/KiritoDB.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.kiritodb; 2 | 3 | import com.alibabacloud.polar_race.engine.common.AbstractVisitor; 4 | import com.alibabacloud.polar_race.engine.common.exceptions.EngineException; 5 | import com.alibabacloud.polar_race.engine.common.exceptions.RetCodeEnum; 6 | import moe.cnkirito.kiritodb.common.Constant; 7 | import moe.cnkirito.kiritodb.common.Util; 8 | import moe.cnkirito.kiritodb.data.CommitLog; 9 | import moe.cnkirito.kiritodb.index.CommitLogIndex; 10 | import moe.cnkirito.kiritodb.partition.HighTenPartitioner; 11 | import moe.cnkirito.kiritodb.partition.Partitionable; 12 | import moe.cnkirito.kiritodb.range.CacheItem; 13 | import moe.cnkirito.kiritodb.range.FetchDataProducer; 14 | import moe.cnkirito.kiritodb.range.RangeTask; 15 | import org.slf4j.Logger; 16 | import org.slf4j.LoggerFactory; 17 | 18 | import java.io.IOException; 19 | import java.nio.ByteBuffer; 20 | import java.util.concurrent.CountDownLatch; 21 | import java.util.concurrent.LinkedBlockingQueue; 22 | import java.util.concurrent.atomic.AtomicBoolean; 23 | 24 | /** 25 | * @author kirito.moe@foxmail.com 26 | * Date 2018-10-28 27 | */ 28 | public class KiritoDB { 29 | 30 | private static final Logger logger = LoggerFactory.getLogger(KiritoDB.class); 31 | // partition num 32 | private final int partitionNum = Constant.partitionNum; 33 | // key -> partition 34 | private 
volatile Partitionable partitionable; 35 | // data 36 | public volatile CommitLog[] commitLogs; 37 | // index 38 | private volatile CommitLogIndex[] commitLogIndices; 39 | // true means need to load index into memory, false means no need 40 | private volatile boolean loadFlag = false; 41 | 42 | public KiritoDB() { 43 | partitionable = new HighTenPartitioner(); 44 | } 45 | 46 | public void open(String path) throws EngineException { 47 | if (path.endsWith("/")) { 48 | path = path.substring(0, path.length() - 1); 49 | } 50 | commitLogs = new CommitLog[partitionNum]; 51 | commitLogIndices = new CommitLogIndex[partitionNum]; 52 | try { 53 | for (int i = 0; i < partitionNum; i++) { 54 | commitLogs[i] = new CommitLog(); 55 | commitLogs[i].init(path, i); 56 | } 57 | for (int i = 0; i < partitionNum; i++) { 58 | commitLogIndices[i] = new CommitLogIndex(); 59 | commitLogIndices[i].init(path, i); 60 | commitLogIndices[i].setCommitLog(commitLogs[i]); 61 | this.loadFlag = commitLogIndices[i].isLoadFlag(); 62 | } 63 | if (!loadFlag) { 64 | loadAllIndex(); 65 | } 66 | } catch (IOException e) { 67 | throw new EngineException(RetCodeEnum.IO_ERROR, "open exception"); 68 | } 69 | } 70 | 71 | public void write(byte[] key, byte[] value) throws EngineException { 72 | int partition = partitionable.getPartition(key); 73 | CommitLog hitCommitLog = commitLogs[partition]; 74 | CommitLogIndex hitIndex = commitLogIndices[partition]; 75 | synchronized (hitCommitLog) { 76 | hitCommitLog.write(value); 77 | hitIndex.write(key); 78 | } 79 | } 80 | 81 | public byte[] read(byte[] key) throws EngineException { 82 | int partition = partitionable.getPartition(key); 83 | CommitLog hitCommitLog = commitLogs[partition]; 84 | CommitLogIndex hitIndex = commitLogIndices[partition]; 85 | Long offset = hitIndex.read(key); 86 | if (offset == null) { 87 | throw new EngineException(RetCodeEnum.NOT_FOUND, Util.bytes2Long(key) + " not found"); 88 | } 89 | try { 90 | return hitCommitLog.read(offset); 91 | } catch 
(IOException e) { 92 | throw new EngineException(RetCodeEnum.IO_ERROR, "commit log read exception"); 93 | } 94 | } 95 | 96 | // fetch thread flag 97 | private final AtomicBoolean rangFirst = new AtomicBoolean(false); 98 | private static ThreadLocal visitorCallbackValue = ThreadLocal.withInitial(() -> new byte[Constant.VALUE_LENGTH]); 99 | private static ThreadLocal visitorCallbackKey = ThreadLocal.withInitial(() -> new byte[Constant.INDEX_LENGTH]); 100 | private final static int THREAD_NUM = 64; 101 | private LinkedBlockingQueue rangeTaskLinkedBlockingQueue = new LinkedBlockingQueue<>(); 102 | 103 | public void range(byte[] lower, byte[] upper, AbstractVisitor visitor) throws EngineException { 104 | // 第一次 range 的时候开启 fetch 线程 105 | if (rangFirst.compareAndSet(false, true)) { 106 | // logger.info("[jvm info] range first now {} ", Util.getFreeMemory()); 107 | initPreFetchThreads(); 108 | } 109 | RangeTask rangeTask = new RangeTask(visitor, new CountDownLatch(1)); 110 | rangeTaskLinkedBlockingQueue.offer(rangeTask); 111 | try { 112 | rangeTask.getCountDownLatch().await(); 113 | } catch (InterruptedException e) { 114 | e.printStackTrace(); 115 | } 116 | } 117 | 118 | private volatile FetchDataProducer fetchDataProducer; 119 | 120 | private void initPreFetchThreads() { 121 | Thread fetchThread = new Thread(() -> { 122 | fetchDataProducer = new FetchDataProducer(this); 123 | for (int f = 0; f < 2; f++) { 124 | RangeTask[] rangeTasks = new RangeTask[THREAD_NUM]; 125 | for (int i = 0; i < THREAD_NUM; i++) { 126 | try { 127 | rangeTasks[i] = rangeTaskLinkedBlockingQueue.take(); 128 | } catch (InterruptedException e) { 129 | e.printStackTrace(); 130 | } 131 | } 132 | fetchDataProducer.initFetch(); 133 | fetchDataProducer.startFetch(); 134 | for (int i = 0; i < THREAD_NUM; i++) { 135 | final int rangeIndex = i; 136 | Thread thread = new Thread(() -> { 137 | RangeTask myTask = rangeTasks[rangeIndex]; 138 | for (int dbIndex = 0; dbIndex < partitionNum; dbIndex++) { 139 | 
CacheItem cacheItem; 140 | while (true) { 141 | cacheItem = fetchDataProducer.getCacheItem(dbIndex); 142 | if (cacheItem != null) { 143 | break; 144 | } 145 | sleep1us(); 146 | } 147 | while (true) { 148 | if (cacheItem.ready) { 149 | break; 150 | } 151 | sleep1us(); 152 | } 153 | byte[] value = visitorCallbackValue.get(); 154 | byte[] key = visitorCallbackKey.get(); 155 | ByteBuffer valueCache = cacheItem.buffer.slice(); 156 | int keySize = commitLogIndices[dbIndex].getMemoryIndex().getSize(); 157 | int[] offset = commitLogIndices[dbIndex].getMemoryIndex().getOffset(); 158 | long[] keys = commitLogIndices[dbIndex].getMemoryIndex().getKeys(); 159 | for (int j = 0; j < keySize; j++) { 160 | valueCache.position(offset[j] * Constant.VALUE_LENGTH); 161 | valueCache.get(value); 162 | Util.long2bytes(key, keys[j]); 163 | rangeTasks[rangeIndex].getAbstractVisitor().visit(key, value); 164 | } 165 | while (true) { 166 | if (cacheItem.allReach) { 167 | break; 168 | } 169 | sleep1us(); 170 | } 171 | fetchDataProducer.release(dbIndex); 172 | } 173 | myTask.getCountDownLatch().countDown(); 174 | }); 175 | thread.setDaemon(true); 176 | thread.start(); 177 | } 178 | } 179 | }); 180 | fetchThread.setDaemon(true); 181 | fetchThread.start(); 182 | } 183 | 184 | private void sleep1us() { 185 | try { 186 | Thread.sleep(0, 1); 187 | } catch (InterruptedException e) { 188 | e.printStackTrace(); 189 | } 190 | } 191 | 192 | private void loadAllIndex() { 193 | int loadThreadNum = THREAD_NUM; 194 | CountDownLatch countDownLatch = new CountDownLatch(loadThreadNum); 195 | for (int i = 0; i < loadThreadNum; i++) { 196 | final int index = i; 197 | new Thread(() -> { 198 | for (int partition = 0; partition < partitionNum; partition++) { 199 | if (partition % loadThreadNum == index) { 200 | commitLogIndices[partition].load(); 201 | } 202 | } 203 | countDownLatch.countDown(); 204 | }).start(); 205 | } 206 | try { 207 | countDownLatch.await(); 208 | } catch (InterruptedException e) { 209 | 
logger.error("load index interrupted", e); 210 | } 211 | this.loadFlag = true; 212 | } 213 | 214 | public void close() { 215 | if (commitLogs != null) { 216 | for (CommitLog commitLog : commitLogs) { 217 | try { 218 | commitLog.destroy(); 219 | } catch (IOException e) { 220 | logger.error("data destroy error", e); 221 | } 222 | } 223 | } 224 | if (commitLogIndices != null) { 225 | for (CommitLogIndex commitLogIndex : commitLogIndices) { 226 | try { 227 | commitLogIndex.destroy(); 228 | } catch (IOException e) { 229 | logger.error("data destroy error", e); 230 | } 231 | } 232 | } 233 | } 234 | } 235 | -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/kiritodb/common/Constant.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.kiritodb.common; 2 | 3 | import moe.cnkirito.directio.DirectIOLib; 4 | 5 | public class Constant { 6 | 7 | public static final String DATA_PREFIX = "/data"; 8 | public static final String DATA_SUFFIX = ".polar"; 9 | public static final String INDEX_PREFIX = "/index"; 10 | public static final String INDEX_SUFFIX = ".polar"; 11 | public static final int VALUE_LENGTH = 4 * 1024; 12 | public static final int INDEX_LENGTH = 8; 13 | public static final int _4kb = 4 * 1024; 14 | 15 | public static int expectedNumPerPartition = 64000; 16 | public static int partitionNum = 1 << 10; 17 | 18 | public static DirectIOLib directIOLib = DirectIOLib.getLibForPath("test_directory"); 19 | 20 | } 21 | -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/kiritodb/common/LoopQuerySemaphore.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.kiritodb.common; 2 | 3 | /** 4 | * @author daofeng.xjf 5 | * @date 2018/11/30 6 | */ 7 | public class LoopQuerySemaphore { 8 | 9 | private volatile boolean permits; 10 | 11 
| public LoopQuerySemaphore(int permits) { 12 | if (permits > 0) { 13 | this.permits = true; 14 | } else { 15 | this.permits = false; 16 | } 17 | } 18 | 19 | public void acquire() throws InterruptedException { 20 | while (!permits) { 21 | Thread.sleep(0,1); 22 | } 23 | permits = false; 24 | } 25 | 26 | public void acquireNoSleep() throws InterruptedException { 27 | while (!permits) { 28 | } 29 | permits = false; 30 | } 31 | 32 | public void release() { 33 | permits = true; 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/kiritodb/common/UnsafeUtil.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.kiritodb.common; 2 | 3 | import sun.misc.Unsafe; 4 | 5 | import java.lang.reflect.Field; 6 | 7 | public class UnsafeUtil { 8 | 9 | public static final Unsafe UNSAFE; 10 | 11 | static { 12 | try { 13 | Field field = Unsafe.class.getDeclaredField("theUnsafe"); 14 | field.setAccessible(true); 15 | UNSAFE = (Unsafe) field.get(null); 16 | } catch (Exception e) { 17 | throw new RuntimeException(e); 18 | } 19 | } 20 | 21 | } -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/kiritodb/common/Util.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.kiritodb.common; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.io.InputStreamReader; 7 | import java.lang.management.ManagementFactory; 8 | import java.lang.reflect.Method; 9 | import java.nio.ByteBuffer; 10 | import java.nio.MappedByteBuffer; 11 | import java.security.AccessController; 12 | import java.security.PrivilegedAction; 13 | import java.text.SimpleDateFormat; 14 | import java.util.Date; 15 | 16 | public class Util { 17 | 18 | public static String getFreeMemory() { 19 | long 
free = Runtime.getRuntime().freeMemory() / 1024 / 1024; 20 | long total = Runtime.getRuntime().totalMemory() / 1024 / 1024; 21 | long max = Runtime.getRuntime().maxMemory() / 1024 / 1024; 22 | return "free=" + free + "M,total=" + total + "M,max=" + max + "M"; 23 | } 24 | 25 | /** 26 | * 当前时间 27 | * 28 | * @return 29 | */ 30 | public static String curTime() { 31 | SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");//设置日期格式 32 | // new Date()为获取当前系统时间 33 | return df.format(new Date()); 34 | } 35 | 36 | /** 37 | * 当前进程 38 | * 39 | * @return 40 | */ 41 | public static String pid() { 42 | String name = ManagementFactory.getRuntimeMXBean().getName(); 43 | String pid = name.split("@")[0]; 44 | return pid; 45 | } 46 | 47 | /** 48 | * 执行shell指令 49 | * 50 | * @param cmd 51 | * @return 52 | * @throws IOException 53 | */ 54 | public static String runCmd(String cmd) throws IOException { 55 | Process process = Runtime.getRuntime().exec(cmd); 56 | InputStream is = process.getInputStream(); 57 | BufferedReader reader = new BufferedReader(new InputStreamReader(is)); 58 | StringBuffer sb = new StringBuffer(); 59 | String tmp; 60 | int index = 0; 61 | while ((tmp = reader.readLine()) != null && index < 20) { 62 | sb.append(tmp).append("\n"); 63 | ++index; 64 | } 65 | process.destroy(); 66 | return sb.toString(); 67 | } 68 | 69 | /** 70 | * bytes转long 71 | * 72 | * @param buffer 73 | * @return 74 | */ 75 | public static long bytes2Long(byte[] buffer) { 76 | long values = 0; 77 | int len = 8; 78 | for (int i = 0; i < len; ++i) { 79 | values <<= 8; 80 | values |= (buffer[i] & 0xff); 81 | } 82 | return values; 83 | } 84 | 85 | public static void long2bytes(byte[] buffer, long value) { 86 | for (int i = 0; i < 8; ++i) { 87 | int offset = 64 - (i + 1) * 8; 88 | buffer[i] = (byte) ((value >> offset) & 0xff); 89 | } 90 | } 91 | 92 | /** 93 | * long转bytes 94 | * 95 | * @param values 96 | * @return 97 | */ 98 | public static byte[] long2bytes(long values) { 99 | byte[] buffer = 
new byte[8]; 100 | for (int i = 0; i < 8; ++i) { 101 | int offset = 64 - (i + 1) * 8; 102 | buffer[i] = (byte) ((values >> offset) & 0xff); 103 | } 104 | return buffer; 105 | } 106 | 107 | /** 108 | * long转bytes 109 | * 110 | * @param values 111 | * @return 112 | */ 113 | public static byte[] int2bytes(int values) { 114 | byte[] buffer = new byte[4]; 115 | for (int i = 0; i < 4; ++i) { 116 | int offset = 32 - (i + 1) * 8; 117 | buffer[i] = (byte) ((values >> offset) & 0xff); 118 | } 119 | return buffer; 120 | } 121 | 122 | /** 123 | * 模拟随机生成的4kb字节 124 | * 125 | * @param l 126 | * @return 127 | */ 128 | public static byte[] _4kb(long l) { 129 | ByteBuffer buffer = ByteBuffer.allocate(4 * 1024); 130 | buffer.putLong(l); 131 | for (int i = 0; i < 4048 - 8; ++i) { 132 | buffer.put((byte) 0); 133 | } 134 | return buffer.array(); 135 | } 136 | 137 | public static void clean(MappedByteBuffer mappedByteBuffer) { 138 | ByteBuffer buffer = mappedByteBuffer; 139 | if (buffer == null || !buffer.isDirect() || buffer.capacity() == 0) 140 | return; 141 | invoke(invoke(viewed(buffer), "cleaner"), "clean"); 142 | } 143 | 144 | private static Object invoke(final Object target, final String methodName, final Class... 
args) { 145 | return AccessController.doPrivileged(new PrivilegedAction() { 146 | public Object run() { 147 | try { 148 | Method method = method(target, methodName, args); 149 | method.setAccessible(true); 150 | return method.invoke(target); 151 | } catch (Exception e) { 152 | throw new IllegalStateException(e); 153 | } 154 | } 155 | }); 156 | } 157 | 158 | private static Method method(Object target, String methodName, Class[] args) 159 | throws NoSuchMethodException { 160 | try { 161 | return target.getClass().getMethod(methodName, args); 162 | } catch (NoSuchMethodException e) { 163 | return target.getClass().getDeclaredMethod(methodName, args); 164 | } 165 | } 166 | 167 | private static ByteBuffer viewed(ByteBuffer buffer) { 168 | String methodName = "viewedBuffer"; 169 | Method[] methods = buffer.getClass().getMethods(); 170 | for (int i = 0; i < methods.length; i++) { 171 | if (methods[i].getName().equals("attachment")) { 172 | methodName = "attachment"; 173 | break; 174 | } 175 | } 176 | ByteBuffer viewedBuffer = (ByteBuffer) invoke(buffer, methodName); 177 | if (viewedBuffer == null) 178 | return buffer; 179 | else 180 | return viewed(viewedBuffer); 181 | } 182 | 183 | } 184 | -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/kiritodb/data/CommitLog.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.kiritodb.data; 2 | 3 | import com.alibabacloud.polar_race.engine.common.exceptions.EngineException; 4 | import com.alibabacloud.polar_race.engine.common.exceptions.RetCodeEnum; 5 | import moe.cnkirito.directio.DirectIOLib; 6 | import moe.cnkirito.directio.DirectIOUtils; 7 | import moe.cnkirito.kiritodb.common.Constant; 8 | import net.smacke.jaydio.DirectRandomAccessFile; 9 | import sun.misc.Contended; 10 | import sun.nio.ch.DirectBuffer; 11 | 12 | import java.io.File; 13 | import java.io.IOException; 14 | import 
java.io.RandomAccessFile; 15 | import java.nio.ByteBuffer; 16 | import java.nio.channels.FileChannel; 17 | 18 | import static moe.cnkirito.kiritodb.common.UnsafeUtil.UNSAFE; 19 | 20 | /** 21 | * @author kirito.moe@foxmail.com 22 | * Date 2018-10-28 23 | */ 24 | @Contended 25 | public class CommitLog { 26 | 27 | // buffer 28 | private static ThreadLocal bufferThreadLocal = ThreadLocal.withInitial(() -> ByteBuffer.allocate(Constant.VALUE_LENGTH)); 29 | private static ThreadLocal byteArrayThreadLocal = ThreadLocal.withInitial(() -> new byte[Constant.VALUE_LENGTH]); 30 | private FileChannel fileChannel; 31 | private DirectRandomAccessFile directRandomAccessFile; 32 | private moe.cnkirito.directio.DirectRandomAccessFile directFileForRange; 33 | private ByteBuffer writeBuffer; 34 | /** 35 | * we want to use {@link sun.misc.Unsafe} to copy memory, 36 | */ 37 | private long writeBufferAddress; 38 | private boolean dioSupport; 39 | /** 40 | * file write pointer 41 | */ 42 | private long wrotePosition; 43 | /** 44 | * buffer write pointer 45 | */ 46 | private int bufferPosition; 47 | 48 | public void init(String path, int no) throws IOException { 49 | File dirFile = new File(path); 50 | if (!dirFile.exists()) { 51 | dirFile.mkdirs(); 52 | } 53 | File file = new File(path + Constant.DATA_PREFIX + no + Constant.DATA_SUFFIX); 54 | if (!file.exists()) { 55 | file.createNewFile(); 56 | } 57 | this.fileChannel = new RandomAccessFile(file, "rw").getChannel(); 58 | try { 59 | this.directRandomAccessFile = new DirectRandomAccessFile(file, "r"); 60 | this.dioSupport = true; 61 | } catch (Exception e) { 62 | this.dioSupport = false; 63 | } 64 | if (DirectIOLib.binit) { 65 | directFileForRange = new moe.cnkirito.directio.DirectRandomAccessFile(file, "rw"); 66 | this.writeBuffer = DirectIOUtils.allocateForDirectIO(Constant.directIOLib, Constant.VALUE_LENGTH * 4); 67 | } else { 68 | this.writeBuffer = ByteBuffer.allocateDirect(Constant.VALUE_LENGTH * 4); 69 | } 70 | 71 | 
this.writeBufferAddress = ((DirectBuffer) this.writeBuffer).address(); 72 | this.wrotePosition = 0; 73 | this.bufferPosition = 0; 74 | 75 | } 76 | 77 | public synchronized byte[] read(long offset) throws IOException { 78 | if (this.dioSupport) { 79 | byte[] bytes = byteArrayThreadLocal.get(); 80 | directRandomAccessFile.seek(offset); 81 | directRandomAccessFile.read(bytes); 82 | return bytes; 83 | } else { 84 | ByteBuffer buffer = bufferThreadLocal.get(); 85 | buffer.clear(); 86 | this.fileChannel.read(buffer, offset); 87 | return buffer.array(); 88 | } 89 | } 90 | 91 | public synchronized void write(byte[] data) throws EngineException { 92 | UNSAFE.copyMemory(data, 16, null, writeBufferAddress + bufferPosition * Constant.VALUE_LENGTH, Constant.VALUE_LENGTH); 93 | bufferPosition++; 94 | if (bufferPosition >= 4) { 95 | this.writeBuffer.position(0); 96 | this.writeBuffer.limit(bufferPosition * Constant.VALUE_LENGTH); 97 | if (DirectIOLib.binit) { 98 | try { 99 | this.directFileForRange.write(writeBuffer, this.wrotePosition); 100 | } catch (IOException e) { 101 | throw new EngineException(RetCodeEnum.IO_ERROR, "direct write data io error"); 102 | } 103 | } else { 104 | try { 105 | this.fileChannel.write(this.writeBuffer); 106 | } catch (IOException e) { 107 | throw new EngineException(RetCodeEnum.IO_ERROR, "fileChannel write data io error"); 108 | } 109 | } 110 | this.wrotePosition += Constant.VALUE_LENGTH * bufferPosition; 111 | bufferPosition = 0; 112 | } 113 | } 114 | 115 | /** 116 | * 加载整个data文件进入内存 117 | */ 118 | public void loadAll(ByteBuffer buffer) throws IOException { 119 | buffer.clear(); 120 | if (DirectIOLib.binit) { 121 | if (directRandomAccessFile.length() > 0) { 122 | directFileForRange.read(buffer, 0); 123 | } 124 | // no need flip 125 | } else { 126 | this.fileChannel.read(buffer, 0); 127 | buffer.flip(); 128 | } 129 | } 130 | 131 | public int getFileLength() { 132 | try { 133 | return (int) (this.fileChannel.size() / Constant.VALUE_LENGTH); 134 | } 
catch (IOException e) { 135 | return 0; 136 | } 137 | } 138 | 139 | public void destroy() throws IOException { 140 | if (bufferPosition > 0) { 141 | this.writeBuffer.position(0); 142 | this.writeBuffer.limit(bufferPosition * Constant.VALUE_LENGTH); 143 | if (DirectIOLib.binit) { 144 | this.directFileForRange.write(writeBuffer, this.wrotePosition); 145 | } else { 146 | this.fileChannel.write(this.writeBuffer); 147 | } 148 | this.wrotePosition += Constant.VALUE_LENGTH * bufferPosition; 149 | bufferPosition = 0; 150 | } 151 | this.writeBuffer = null; 152 | if (this.fileChannel != null) { 153 | this.fileChannel.close(); 154 | } 155 | if (this.directRandomAccessFile != null) { 156 | this.directRandomAccessFile.close(); 157 | } 158 | } 159 | 160 | } 161 | -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/kiritodb/data/CommitLogAware.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.kiritodb.data; 2 | 3 | public interface CommitLogAware { 4 | void setCommitLog(CommitLog commitLog); 5 | } 6 | -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/kiritodb/index/ArrayMemoryIndex.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.kiritodb.index; 2 | 3 | /** 4 | * the implementation of memory index using java origin array 5 | */ 6 | public class ArrayMemoryIndex implements MemoryIndex { 7 | 8 | private long keys[]; 9 | private int offset[]; 10 | private int indexSize; 11 | 12 | public ArrayMemoryIndex(int initSize) { 13 | this.keys = new long[initSize]; 14 | this.offset = new int[initSize]; 15 | this.indexSize = initSize; 16 | } 17 | 18 | @Override 19 | public int getSize() { 20 | return this.indexSize; 21 | } 22 | 23 | @Override 24 | public void init() { 25 | this.sortAndCompact(); 26 | } 27 | 28 | @Override 29 | public 
void insertIndexCache(long key, int value) { 30 | this.keys[value] = key; 31 | this.offset[value] = value; 32 | } 33 | 34 | @Override 35 | public int get(long key) { 36 | return this.binarySearchPosition(key); 37 | } 38 | 39 | @Override 40 | public long[] getKeys() { 41 | return this.keys; 42 | } 43 | 44 | @Override 45 | public int[] getOffset() { 46 | return this.offset; 47 | } 48 | 49 | /** 50 | * sort the index and compact the same key 51 | */ 52 | private void sortAndCompact() { 53 | if (this.indexSize != 0) { 54 | sort(0, this.indexSize - 1); 55 | if (this.indexSize > 60000 && this.indexSize < 64000) { 56 | return; 57 | } 58 | compact(); 59 | } 60 | } 61 | 62 | private void compact() { 63 | long[] newKeys = new long[indexSize]; 64 | int[] newOffsetInts = new int[indexSize]; 65 | 66 | int curIndex = 0; 67 | newOffsetInts[0] = this.offset[0]; 68 | newKeys[0] = this.keys[0]; 69 | for (int i = 1; i < this.indexSize; i++) { 70 | if (this.keys[i] != this.keys[i - 1]) { 71 | curIndex++; 72 | newKeys[curIndex] = this.keys[i]; 73 | newOffsetInts[curIndex] = this.offset[i]; 74 | } else { 75 | newOffsetInts[curIndex] = Math.max(newOffsetInts[curIndex], this.offset[i]); 76 | } 77 | } 78 | this.indexSize = curIndex + 1; 79 | this.offset = newOffsetInts; 80 | this.keys = newKeys; 81 | } 82 | 83 | private int binarySearchPosition(long key) { 84 | int index = this.binarySearch(0, indexSize, key); 85 | if (index >= 0) { 86 | return this.offset[index]; 87 | } else { 88 | return -1; 89 | } 90 | } 91 | 92 | private void sort(int low, int high) { 93 | int start = low; 94 | int end = high; 95 | long key = this.keys[low]; 96 | 97 | while (end > start) { 98 | while (end > start && this.keys[end] >= key) 99 | end--; 100 | if (this.keys[end] <= key) { 101 | swap(start, end); 102 | } 103 | //从前往后比较 104 | while (end > start && this.keys[start] <= key) 105 | start++; 106 | if (this.keys[start] >= key) { 107 | swap(start, end); 108 | } 109 | } 110 | if (start > low) sort(low, start - 1); 
111 | if (end < high) sort(end + 1, high); 112 | } 113 | 114 | 115 | private int binarySearch(int fromIndex, int toIndex, long key) { 116 | int low = fromIndex; 117 | int high = toIndex - 1; 118 | 119 | while (low <= high) { 120 | int mid = (low + high) >>> 1; 121 | long midVal = this.keys[mid]; 122 | int cmp = Long.compare(midVal, key); 123 | if (cmp < 0) 124 | low = mid + 1; 125 | else if (cmp > 0) 126 | high = mid - 1; 127 | else 128 | return mid; // keys found 129 | } 130 | return -(low + 1); // keys not found. 131 | } 132 | 133 | private void swap(int i, int j) { 134 | if (i == j) return; 135 | keys[i] ^= keys[j]; 136 | keys[j] ^= keys[i]; 137 | keys[i] ^= keys[j]; 138 | 139 | offset[i] ^= offset[j]; 140 | offset[j] ^= offset[i]; 141 | offset[i] ^= offset[j]; 142 | } 143 | 144 | } 145 | -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/kiritodb/index/CommitLogIndex.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.kiritodb.index; 2 | 3 | import moe.cnkirito.directio.DirectIOLib; 4 | import moe.cnkirito.kiritodb.common.Constant; 5 | import moe.cnkirito.kiritodb.common.Util; 6 | import moe.cnkirito.kiritodb.data.CommitLog; 7 | import moe.cnkirito.kiritodb.data.CommitLogAware; 8 | import net.smacke.jaydio.DirectRandomAccessFile; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | import sun.misc.Contended; 12 | import sun.nio.ch.DirectBuffer; 13 | 14 | import java.io.File; 15 | import java.io.IOException; 16 | import java.io.RandomAccessFile; 17 | import java.nio.ByteBuffer; 18 | import java.nio.MappedByteBuffer; 19 | import java.nio.channels.FileChannel; 20 | 21 | import static moe.cnkirito.kiritodb.common.Constant._4kb; 22 | import static moe.cnkirito.kiritodb.common.UnsafeUtil.UNSAFE; 23 | 24 | /** 25 | * @author kirito.moe@foxmail.com 26 | * Date 2018-10-28 27 | */ 28 | @Contended 29 | public class 
CommitLogIndex implements CommitLogAware { 30 | 31 | private final static Logger logger = LoggerFactory.getLogger(CommitLogIndex.class); 32 | // memory index dataStructure 33 | private MemoryIndex memoryIndex; 34 | private FileChannel fileChannel; 35 | private MappedByteBuffer mappedByteBuffer; 36 | private DirectRandomAccessFile directRandomAccessFile; 37 | // mmap byteBuffer start address 38 | private long address; 39 | // 当前索引写入的区域 40 | private CommitLog commitLog; 41 | // determine current index block is loaded into memory 42 | private volatile boolean loadFlag = false; 43 | private long wrotePosition; 44 | 45 | public void init(String path, int no) throws IOException { 46 | File dirFile = new File(path); 47 | if (!dirFile.exists()) { 48 | dirFile.mkdirs(); 49 | loadFlag = true; 50 | } 51 | File file = new File(path + Constant.INDEX_PREFIX + no + Constant.INDEX_SUFFIX); 52 | if (!file.exists()) { 53 | file.createNewFile(); 54 | loadFlag = true; 55 | } 56 | this.fileChannel = new RandomAccessFile(file, "rw").getChannel(); 57 | if(DirectIOLib.binit){ 58 | directRandomAccessFile = new DirectRandomAccessFile(file, "r"); 59 | } 60 | } 61 | 62 | public void load() { 63 | int indexSize = commitLog.getFileLength(); 64 | this.memoryIndex = new ArrayMemoryIndex(indexSize); 65 | if (indexSize == 0) { 66 | return; 67 | } 68 | if (DirectIOLib.binit) { 69 | ByteBuffer buffer = ByteBuffer.allocate((indexSize * Constant.INDEX_LENGTH / _4kb + 1) * _4kb); 70 | try { 71 | directRandomAccessFile.read(buffer.array()); 72 | } catch (IOException e) { 73 | logger.error("load index failed", e); 74 | } 75 | buffer.position(0); 76 | buffer.limit(indexSize * Constant.INDEX_LENGTH); 77 | for (int curIndex = 0; curIndex < indexSize; curIndex++) { 78 | buffer.position(curIndex * Constant.INDEX_LENGTH); 79 | long key = buffer.getLong(); 80 | this.memoryIndex.insertIndexCache(key, curIndex); 81 | } 82 | } else { 83 | ByteBuffer buffer = ByteBuffer.allocateDirect(indexSize * 
Constant.INDEX_LENGTH); 84 | try { 85 | fileChannel.read(buffer); 86 | } catch (IOException e) { 87 | logger.error("load index failed", e); 88 | } 89 | buffer.flip(); 90 | for (int curIndex = 0; curIndex < indexSize; curIndex++) { 91 | buffer.position(curIndex * Constant.INDEX_LENGTH); 92 | long key = buffer.getLong(); 93 | this.memoryIndex.insertIndexCache(key, curIndex); 94 | } 95 | ((DirectBuffer) buffer).cleaner().clean(); 96 | } 97 | memoryIndex.init(); 98 | this.loadFlag = true; 99 | } 100 | 101 | public void releaseFile() throws IOException { 102 | if (this.mappedByteBuffer != null) { 103 | fileChannel.close(); 104 | Util.clean(this.mappedByteBuffer); 105 | this.fileChannel = null; 106 | this.mappedByteBuffer = null; 107 | } 108 | } 109 | 110 | public void destroy() throws IOException { 111 | commitLog = null; 112 | loadFlag = false; 113 | releaseFile(); 114 | } 115 | 116 | public Long read(byte[] key) { 117 | int offsetInt = this.memoryIndex.get(Util.bytes2Long(key)); 118 | if (offsetInt < 0) { 119 | return null; 120 | } 121 | return ((long) offsetInt) * Constant.VALUE_LENGTH; 122 | } 123 | 124 | public void write(byte[] key) { 125 | if (this.mappedByteBuffer == null) { 126 | try { 127 | this.mappedByteBuffer = this.fileChannel.map(FileChannel.MapMode.READ_WRITE, 0, Constant.INDEX_LENGTH * Constant.expectedNumPerPartition); 128 | } catch (IOException e) { 129 | logger.error("mmap failed", e); 130 | } 131 | this.address = ((DirectBuffer) mappedByteBuffer).address(); 132 | this.wrotePosition = 0; 133 | } 134 | if (this.wrotePosition >= this.mappedByteBuffer.limit() - Constant.INDEX_LENGTH) { 135 | try { 136 | this.mappedByteBuffer = this.fileChannel.map(FileChannel.MapMode.READ_WRITE, 0, Constant.INDEX_LENGTH * 203000); 137 | } catch (IOException e) { 138 | logger.error("mmap failed", e); 139 | } 140 | this.address = ((DirectBuffer) mappedByteBuffer).address(); 141 | } 142 | UNSAFE.copyMemory(key, 16, null, address + wrotePosition, Constant.INDEX_LENGTH); 143 
| wrotePosition += Constant.INDEX_LENGTH; 144 | } 145 | 146 | public boolean isLoadFlag() { 147 | return loadFlag; 148 | } 149 | 150 | @Override 151 | public void setCommitLog(CommitLog commitLog) { 152 | this.commitLog = commitLog; 153 | } 154 | 155 | public MemoryIndex getMemoryIndex() { 156 | return memoryIndex; 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/kiritodb/index/HppcMemoryIndex.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.kiritodb.index; 2 | 3 | import com.carrotsearch.hppc.LongIntHashMap; 4 | 5 | /** 6 | * the implementation of memory index using ${@link com.carrotsearch.hppc.LongIntHashMap} 7 | */ 8 | public class HppcMemoryIndex implements MemoryIndex { 9 | 10 | private LongIntHashMap indexMap; 11 | 12 | public HppcMemoryIndex() { 13 | this.indexMap = new LongIntHashMap(); 14 | } 15 | 16 | @Override 17 | public int getSize() { 18 | return indexMap.size(); 19 | } 20 | 21 | @Override 22 | public void init() { 23 | //do nothing 24 | } 25 | 26 | @Override 27 | public void insertIndexCache(long key, int value) { 28 | this.indexMap.put(key, value); 29 | } 30 | 31 | @Override 32 | public int get(long key) { 33 | return this.indexMap.getOrDefault(key, -1); 34 | } 35 | 36 | @Override 37 | public long[] getKeys() { 38 | throw new UnsupportedOperationException("getKeys() unsupported"); 39 | } 40 | 41 | @Override 42 | public int[] getOffset() { 43 | throw new UnsupportedOperationException("getOffset() unsupported"); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/kiritodb/index/MemoryIndex.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.kiritodb.index; 2 | 3 | /** 4 | * save index in the memory 5 | */ 6 | public interface MemoryIndex { 7 | int 
getSize(); 8 | void init(); 9 | void insertIndexCache(long key, int value); 10 | int get(long key); 11 | long[] getKeys(); 12 | int[] getOffset(); 13 | } 14 | -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/kiritodb/partition/FirstBytePartitoner.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.kiritodb.partition; 2 | 3 | public class FirstBytePartitoner implements Partitionable { 4 | @Override 5 | public int getPartition(byte[] key) { 6 | return key[0] & 0xff; 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/kiritodb/partition/HighTenPartitioner.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.kiritodb.partition; 2 | 3 | /** 4 | * using high ten bit of the given key to determine which file it hits. 5 | */ 6 | public class HighTenPartitioner implements Partitionable { 7 | @Override 8 | public int getPartition(byte[] key) { 9 | return ((key[0] & 0xff) << 2) | ((key[1] & 0xff) >> 6); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/kiritodb/partition/Partitionable.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.kiritodb.partition; 2 | 3 | /** 4 | * determine how to hash the key 5 | */ 6 | public interface Partitionable { 7 | int getPartition(byte[] key); 8 | } 9 | -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/kiritodb/range/CacheItem.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.kiritodb.range; 2 | 3 | import java.nio.ByteBuffer; 4 | 5 | /** 6 | * the cache of one partition 7 | */ 8 | public 
class CacheItem { 9 | 10 | public volatile int dbIndex; 11 | public volatile int useRef; 12 | public volatile boolean ready; 13 | public volatile boolean allReach; 14 | public volatile ByteBuffer buffer; 15 | 16 | } 17 | -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/kiritodb/range/FetchDataProducer.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.kiritodb.range; 2 | 3 | import moe.cnkirito.directio.DirectIOLib; 4 | import moe.cnkirito.directio.DirectIOUtils; 5 | import moe.cnkirito.kiritodb.KiritoDB; 6 | import moe.cnkirito.kiritodb.common.Constant; 7 | import moe.cnkirito.kiritodb.data.CommitLog; 8 | import net.openhft.affinity.AffinityLock; 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | 12 | import java.io.IOException; 13 | import java.nio.ByteBuffer; 14 | import java.util.concurrent.locks.Lock; 15 | import java.util.concurrent.locks.ReentrantLock; 16 | 17 | /** 18 | * producer to fetch data on the disk file into ${@link CacheItem} 19 | */ 20 | public class FetchDataProducer { 21 | 22 | private final static Logger logger = LoggerFactory.getLogger(FetchDataProducer.class); 23 | 24 | /** 25 | * the windowsNum determine the num of CacheItem 26 | */ 27 | private int windowsNum; 28 | private CacheItem[] cacheItems; 29 | private CommitLog[] commitLogs; 30 | private Lock lock; 31 | 32 | public FetchDataProducer(KiritoDB kiritoDB) { 33 | lock = new ReentrantLock(); 34 | int expectedNumPerPartition = kiritoDB.commitLogs[0].getFileLength(); 35 | for (int i = 1; i < Constant.partitionNum; i++) { 36 | expectedNumPerPartition = Math.max(kiritoDB.commitLogs[i].getFileLength(), expectedNumPerPartition); 37 | } 38 | if (expectedNumPerPartition < 64000) { 39 | windowsNum = 4; 40 | } else { 41 | windowsNum = 1; 42 | } 43 | cacheItems = new CacheItem[windowsNum]; 44 | for (int i = 0; i < windowsNum; i++) { 45 | CacheItem 
cacheItem = new CacheItem(); 46 | if (DirectIOLib.binit) { 47 | cacheItem.buffer = DirectIOUtils.allocateForDirectIO(Constant.directIOLib, expectedNumPerPartition * Constant.VALUE_LENGTH); 48 | } else { 49 | cacheItem.buffer = ByteBuffer.allocateDirect(expectedNumPerPartition * Constant.VALUE_LENGTH); 50 | } 51 | cacheItems[i] = cacheItem; 52 | } 53 | this.commitLogs = kiritoDB.commitLogs; 54 | } 55 | 56 | public void startFetch() { 57 | for (int threadNo = 0; threadNo < windowsNum; threadNo++) { 58 | final int threadPartition = threadNo; 59 | Thread t = new Thread(new Runnable() { 60 | @Override 61 | public void run() { 62 | // bound core 63 | try (final AffinityLock al2 = AffinityLock.acquireLock()) { 64 | for (int i = 0; i < Constant.partitionNum / windowsNum; i++) { 65 | int dbIndex = i * windowsNum + threadPartition; 66 | CacheItem cacheItem; 67 | while (true) { 68 | cacheItem = getCacheItem(dbIndex); 69 | if (cacheItem != null) { 70 | break; 71 | } 72 | } 73 | commitLogs[dbIndex].loadAll(cacheItem.buffer); 74 | cacheItem.ready = true; 75 | while (true) { 76 | if (cacheItem.allReach) { 77 | break; 78 | } 79 | } 80 | release(dbIndex); 81 | } 82 | } catch (IOException e) { 83 | logger.error("threadNo{} load failed", threadPartition, e); 84 | } 85 | } 86 | }); 87 | t.setDaemon(true); 88 | t.start(); 89 | } 90 | } 91 | 92 | /** 93 | * getCacheItem is a very important method that control every visit consumer and fetch producer 94 | * whether to achieve the cacheItem 95 | * @param dbIndex 96 | * @return 97 | */ 98 | public CacheItem getCacheItem(int dbIndex) { 99 | int index = dbIndex % windowsNum; 100 | lock.lock(); 101 | if (cacheItems[index].dbIndex == dbIndex) { 102 | cacheItems[index].useRef++; 103 | if (cacheItems[index].useRef == 64 + 1) { 104 | cacheItems[index].allReach = true; 105 | } 106 | lock.unlock(); 107 | return cacheItems[index]; 108 | } 109 | 110 | if (cacheItems[index].useRef > 0) { 111 | lock.unlock(); 112 | return null; 113 | } 114 | 115 | 
cacheItems[index].useRef = 1; 116 | cacheItems[index].ready = false; 117 | cacheItems[index].dbIndex = dbIndex; 118 | cacheItems[index].allReach = false; 119 | 120 | lock.unlock(); 121 | return cacheItems[index]; 122 | } 123 | 124 | /** 125 | * release a cacheItem 126 | * @param dbIndex 127 | */ 128 | public void release(int dbIndex) { 129 | int index = dbIndex % windowsNum; 130 | lock.lock(); 131 | cacheItems[index].useRef--; 132 | lock.unlock(); 133 | } 134 | 135 | public void initFetch() { 136 | for (int i = 0; i < windowsNum; i++) { 137 | cacheItems[i].dbIndex = -1; 138 | cacheItems[i].useRef = -1; 139 | cacheItems[i].ready = false; 140 | cacheItems[i].allReach = false; 141 | } 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/kiritodb/range/LocalVisitor.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.kiritodb.range; 2 | 3 | import com.alibabacloud.polar_race.engine.common.AbstractVisitor; 4 | import moe.cnkirito.kiritodb.common.Util; 5 | import moe.cnkirito.kiritodb.partition.HighTenPartitioner; 6 | 7 | import java.util.concurrent.atomic.AtomicInteger; 8 | 9 | public class LocalVisitor extends AbstractVisitor { 10 | 11 | private byte[] beforeKey = null; 12 | static AtomicInteger atomicInteger = new AtomicInteger(0); 13 | 14 | @Override 15 | public void visit(byte[] key, byte[] value) { 16 | if (beforeKey != null) { 17 | int result = LocalVisitor.compareByteArrays(beforeKey, key); 18 | if (result > 0) { 19 | HighTenPartitioner highTenPartitioner = new HighTenPartitioner(); 20 | System.out.println(atomicInteger.getAndIncrement() + "check range correct error, key is not in order,partition" + highTenPartitioner.getPartition(beforeKey) + " beforeKey=" + Util.bytes2Long(beforeKey) + ",partition " + highTenPartitioner.getPartition(key) + " key=" + Util.bytes2Long(key)); 21 | } 22 | } 23 | beforeKey = key; 24 | } 
25 | 26 | public static int compareByteArrays(byte[] source, byte[] other) { 27 | int length = Math.min(source.length, other.length); 28 | for (int i = 0; i < length; i++) { 29 | int sourceByte = source[i] & 0xff; 30 | int otherType = other[i] & 0xff; 31 | if (sourceByte != otherType) { 32 | return sourceByte - otherType; 33 | } 34 | } 35 | return source.length - other.length; 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /engine_java/src/main/java/moe/cnkirito/kiritodb/range/RangeTask.java: -------------------------------------------------------------------------------- 1 | package moe.cnkirito.kiritodb.range; 2 | 3 | import com.alibabacloud.polar_race.engine.common.AbstractVisitor; 4 | 5 | import java.util.concurrent.CountDownLatch; 6 | 7 | public class RangeTask { 8 | private AbstractVisitor abstractVisitor; 9 | private CountDownLatch countDownLatch; 10 | 11 | public RangeTask(AbstractVisitor abstractVisitor, CountDownLatch countDownLatch) { 12 | this.abstractVisitor = abstractVisitor; 13 | this.countDownLatch = countDownLatch; 14 | } 15 | 16 | public AbstractVisitor getAbstractVisitor() { 17 | return abstractVisitor; 18 | } 19 | 20 | public void setAbstractVisitor(AbstractVisitor abstractVisitor) { 21 | this.abstractVisitor = abstractVisitor; 22 | } 23 | 24 | public CountDownLatch getCountDownLatch() { 25 | return countDownLatch; 26 | } 27 | 28 | public void setCountDownLatch(CountDownLatch countDownLatch) { 29 | this.countDownLatch = countDownLatch; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /engine_java/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO, A1 2 | 3 | # A1 is set to be a ConsoleAppender. 4 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 5 | 6 | # A1 uses PatternLayout. 
7 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n -------------------------------------------------------------------------------- /engine_race/Makefile: -------------------------------------------------------------------------------- 1 | CLEAN_FILES = # deliberately empty, so we can append below. 2 | CXX=g++ 3 | PLATFORM_LDFLAGS= -lpthread -lrt 4 | PLATFORM_CXXFLAGS= -std=c++11 5 | PROFILING_FLAGS=-pg 6 | OPT= 7 | LDFLAGS += -Wl,-rpath=$(RPATH) 8 | 9 | # DEBUG_LEVEL can have two values: 10 | # * DEBUG_LEVEL=2; this is the ultimate debug mode. It will compile benchmark 11 | # without any optimizations. To compile with level 2, issue `make dbg` 12 | # * DEBUG_LEVEL=0; this is the debug level we use for release. If you're 13 | # running benchmark in production you most definitely want to compile benchmark 14 | # with debug level 0. To compile with level 0, run `make`, 15 | 16 | # Set the default DEBUG_LEVEL to 0 17 | DEBUG_LEVEL?=0 18 | 19 | ifeq ($(MAKECMDGOALS),dbg) 20 | DEBUG_LEVEL=2 21 | endif 22 | 23 | # compile with -O2 if debug level is not 2 24 | ifneq ($(DEBUG_LEVEL), 2) 25 | OPT += -O2 -fno-omit-frame-pointer 26 | # if we're compiling for release, compile without debug code (-DNDEBUG) and 27 | # don't treat warnings as errors 28 | OPT += -DNDEBUG 29 | DISABLE_WARNING_AS_ERROR=1 30 | # Skip for archs that don't support -momit-leaf-frame-pointer 31 | ifeq (,$(shell $(CXX) -fsyntax-only -momit-leaf-frame-pointer -xc /dev/null 2>&1)) 32 | OPT += -momit-leaf-frame-pointer 33 | endif 34 | else 35 | $(warning Warning: Compiling in debug mode. 
Don't use the resulting binary in production) 36 | OPT += $(PROFILING_FLAGS) 37 | DEBUG_SUFFIX = "_debug" 38 | endif 39 | 40 | # ---------------------------------------------- 41 | SRC_PATH = $(CURDIR) 42 | 43 | # ----------------Dependences------------------- 44 | 45 | INCLUDE_PATH = -I./ 46 | 47 | # ---------------End Dependences---------------- 48 | 49 | LIB_SOURCES := $(wildcard $(SRC_PATH)/*.cc) 50 | 51 | #----------------------------------------------- 52 | 53 | AM_DEFAULT_VERBOSITY = 0 54 | 55 | AM_V_GEN = $(am__v_GEN_$(V)) 56 | am__v_GEN_ = $(am__v_GEN_$(AM_DEFAULT_VERBOSITY)) 57 | am__v_GEN_0 = @echo " GEN " $(notdir $@); 58 | am__v_GEN_1 = 59 | AM_V_at = $(am__v_at_$(V)) 60 | am__v_at_ = $(am__v_at_$(AM_DEFAULT_VERBOSITY)) 61 | am__v_at_0 = @ 62 | am__v_at_1 = 63 | 64 | AM_V_CC = $(am__v_CC_$(V)) 65 | am__v_CC_ = $(am__v_CC_$(AM_DEFAULT_VERBOSITY)) 66 | am__v_CC_0 = @echo " CC " $(notdir $@); 67 | am__v_CC_1 = 68 | CCLD = $(CC) 69 | LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ 70 | AM_V_CCLD = $(am__v_CCLD_$(V)) 71 | am__v_CCLD_ = $(am__v_CCLD_$(AM_DEFAULT_VERBOSITY)) 72 | am__v_CCLD_0 = @echo " CCLD " $(notdir $@); 73 | am__v_CCLD_1 = 74 | 75 | AM_LINK = $(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) 76 | 77 | CXXFLAGS += -g 78 | 79 | # This (the first rule) must depend on "all". 
80 | default: all 81 | 82 | WARNING_FLAGS = -W -Wextra -Wall -Wsign-compare \ 83 | -Wno-unused-parameter -Woverloaded-virtual \ 84 | -Wnon-virtual-dtor -Wno-missing-field-initializers 85 | 86 | ifndef DISABLE_WARNING_AS_ERROR 87 | WARNING_FLAGS += -Werror 88 | endif 89 | 90 | CXXFLAGS += $(WARNING_FLAGS) $(INCLUDE_PATH) $(PLATFORM_CXXFLAGS) $(OPT) 91 | 92 | LDFLAGS += $(PLATFORM_LDFLAGS) 93 | 94 | LIBOBJECTS = $(LIB_SOURCES:.cc=.o) 95 | # if user didn't config LIBNAME, set the default 96 | ifeq ($(LIBNAME),) 97 | # we should only run benchmark in production with DEBUG_LEVEL 0 98 | LIBNAME=libengine$(DEBUG_SUFFIX) 99 | endif 100 | 101 | ifeq ($(LIBOUTPUT),) 102 | LIBOUTPUT=$(CURDIR)/lib 103 | endif 104 | 105 | ifeq ($(EXEC_DIR),) 106 | EXEC_DIR=$(CURDIR) 107 | endif 108 | 109 | dummy := $(shell mkdir -p $(LIBOUTPUT)) 110 | LIBRARY = $(LIBOUTPUT)/${LIBNAME}.a 111 | INCLUDE_PATH += -I$(EXEC_DIR) 112 | 113 | .PHONY: clean dbg all 114 | 115 | %.o: %.cc 116 | $(AM_V_CC)$(CXX) $(CXXFLAGS) -c $< -o $@ 117 | 118 | all: $(LIBRARY) 119 | 120 | dbg: $(LIBRARY) 121 | 122 | $(LIBRARY): $(LIBOBJECTS) 123 | $(AM_V_at)rm -f $@ 124 | $(AM_V_at)$(AR) $(ARFLAGS) $@ $(LIBOBJECTS) 125 | 126 | clean: 127 | rm -f $(LIBRARY) 128 | rm -rf $(CLEAN_FILES) 129 | rm -rf $(LIBOUTPUT) 130 | find $(SRC_PATH) -maxdepth 1 -name "*.[oda]*" -exec rm -f {} \; 131 | find $(SRC_PATH) -maxdepth 1 -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; 132 | -------------------------------------------------------------------------------- /engine_race/engine_race.cc: -------------------------------------------------------------------------------- 1 | // Copyright [2018] Alibaba Cloud All rights reserved 2 | #include "engine_race.h" 3 | 4 | namespace polar_race { 5 | 6 | RetCode Engine::Open(const std::string& name, Engine** eptr) { 7 | return EngineRace::Open(name, eptr); 8 | } 9 | 10 | Engine::~Engine() { 11 | } 12 | 13 | /* 14 | * Complete the functions below to implement you own engine 15 | */ 16 | 17 
| // 1. Open engine 18 | RetCode EngineRace::Open(const std::string& name, Engine** eptr) { 19 | *eptr = NULL; 20 | EngineRace *engine_race = new EngineRace(name); 21 | 22 | *eptr = engine_race; 23 | return kSucc; 24 | } 25 | 26 | // 2. Close engine 27 | EngineRace::~EngineRace() { 28 | } 29 | 30 | // 3. Write a key-value pair into engine 31 | RetCode EngineRace::Write(const PolarString& key, const PolarString& value) { 32 | return kSucc; 33 | } 34 | 35 | // 4. Read value of a key 36 | RetCode EngineRace::Read(const PolarString& key, std::string* value) { 37 | return kSucc; 38 | } 39 | 40 | /* 41 | * NOTICE: Implement 'Range' in quarter-final, 42 | * you can skip it in preliminary. 43 | */ 44 | // 5. Applies the given Visitor::Visit function to the result 45 | // of every key-value pair in the key range [lower, upper), 46 | // in order 47 | // lower=="" is treated as a key before all keys in the database. 48 | // upper=="" is treated as a key after all keys in the database. 49 | // Therefore the following call will traverse the entire database: 50 | // Range("", "", visitor) 51 | RetCode EngineRace::Range(const PolarString& lower, const PolarString& upper, 52 | Visitor &visitor) { 53 | return kSucc; 54 | } 55 | 56 | } // namespace polar_race 57 | -------------------------------------------------------------------------------- /engine_race/engine_race.h: -------------------------------------------------------------------------------- 1 | // Copyright [2018] Alibaba Cloud All rights reserved 2 | #ifndef ENGINE_RACE_ENGINE_RACE_H_ 3 | #define ENGINE_RACE_ENGINE_RACE_H_ 4 | #include <string> 5 | #include "include/engine.h" 6 | 7 | namespace polar_race { 8 | 9 | class EngineRace : public Engine { 10 | public: 11 | static RetCode Open(const std::string& name, Engine** eptr); 12 | 13 | explicit EngineRace(const std::string& dir) { 14 | } 15 | 16 | ~EngineRace(); 17 | 18 | RetCode Write(const PolarString& key, 19 | const PolarString& value) override; 20 | 21 | RetCode Read(const
PolarString& key, 22 | std::string* value) override; 23 | 24 | /* 25 | * NOTICE: Implement 'Range' in quarter-final, 26 | * you can skip it in preliminary. 27 | */ 28 | RetCode Range(const PolarString& lower, 29 | const PolarString& upper, 30 | Visitor &visitor) override; 31 | 32 | private: 33 | 34 | }; 35 | 36 | } // namespace polar_race 37 | 38 | #endif // ENGINE_RACE_ENGINE_RACE_H_ 39 | -------------------------------------------------------------------------------- /include/engine.h: -------------------------------------------------------------------------------- 1 | // Copyright [2018] Alibaba Cloud All rights reserved 2 | #ifndef INCLUDE_ENGINE_H_ 3 | #define INCLUDE_ENGINE_H_ 4 | #include <string> 5 | #include "polar_string.h" 6 | 7 | namespace polar_race { 8 | 9 | enum RetCode { 10 | kSucc = 0, 11 | kNotFound = 1, 12 | kCorruption = 2, 13 | kNotSupported = 3, 14 | kInvalidArgument = 4, 15 | kIOError = 5, 16 | kIncomplete = 6, 17 | kTimedOut = 7, 18 | kFull = 8, 19 | kOutOfMemory = 9, 20 | }; 21 | 22 | // Pass to Engine::Range for callback 23 | class Visitor { 24 | public: 25 | virtual ~Visitor() {} 26 | 27 | virtual void Visit(const PolarString &key, const PolarString &value) = 0; 28 | }; 29 | 30 | class Engine { 31 | public: 32 | // Open engine 33 | static RetCode Open(const std::string& name, 34 | Engine** eptr); 35 | 36 | Engine() { } 37 | 38 | // Close engine 39 | virtual ~Engine(); 40 | 41 | // Write a key-value pair into engine 42 | virtual RetCode Write(const PolarString& key, 43 | const PolarString& value) = 0; 44 | 45 | // Read value of a key 46 | virtual RetCode Read(const PolarString& key, 47 | std::string* value) = 0; 48 | 49 | 50 | /* 51 | * NOTICE: Implement 'Range' in quarter-final, 52 | * you can skip it in preliminary. 53 | */ 54 | // Applies the given Visitor::Visit function to the result 55 | // of every key-value pair in the key range [lower, upper), 56 | // in order 57 | // lower=="" is treated as a key before all keys in the database.
58 | // upper=="" is treated as a key after all keys in the database. 59 | // Therefore the following call will traverse the entire database: 60 | // Range("", "", visitor) 61 | virtual RetCode Range(const PolarString& lower, 62 | const PolarString& upper, 63 | Visitor &visitor) = 0; 64 | }; 65 | 66 | } // namespace polar_race 67 | 68 | #endif // INCLUDE_ENGINE_H_ 69 | -------------------------------------------------------------------------------- /include/polar_string.h: -------------------------------------------------------------------------------- 1 | // Copyright [2018] Alibaba Cloud All rights reserved 2 | #ifndef INCLUDE_POLAR_STRING_H_ 3 | #define INCLUDE_POLAR_STRING_H_ 4 | 5 | #include 6 | #include 7 | 8 | 9 | namespace polar_race { 10 | 11 | class PolarString { 12 | public: 13 | PolarString() : data_(""), size_(0) { } 14 | 15 | PolarString(const char* d, size_t n) : data_(d), size_(n) { } 16 | 17 | PolarString(const std::string& s) : data_(s.data()), size_(s.size()) { } 18 | 19 | PolarString(const char* s) : data_(s), size_(strlen(s)) { } 20 | 21 | const char* data() const { return data_; } 22 | 23 | size_t size() const { return size_; } 24 | 25 | bool empty() const { return size_ == 0; } 26 | 27 | char operator[](size_t n) const { 28 | return data_[n]; 29 | } 30 | 31 | void clear() { data_ = ""; size_ = 0; } 32 | 33 | 34 | std::string ToString() const; 35 | 36 | // Three-way comparison. 
Returns value: 37 | // < 0 iff "*this" < "b", 38 | // == 0 iff "*this" == "b", 39 | // > 0 iff "*this" > "b" 40 | int compare(const PolarString& b) const; 41 | 42 | bool starts_with(const PolarString& x) const { 43 | return ((size_ >= x.size_) && 44 | (memcmp(data_, x.data_, x.size_) == 0)); 45 | } 46 | 47 | bool ends_with(const PolarString& x) const { 48 | return ((size_ >= x.size_) && 49 | (memcmp(data_ + size_ - x.size_, x.data_, x.size_) == 0)); 50 | } 51 | 52 | private: 53 | const char* data_; 54 | size_t size_; 55 | // Intentionally copyable 56 | }; 57 | 58 | inline bool operator==(const PolarString& x, const PolarString& y) { 59 | return ((x.size() == y.size()) && 60 | (memcmp(x.data(), y.data(), x.size()) == 0)); 61 | } 62 | 63 | inline bool operator!=(const PolarString& x, const PolarString& y) { 64 | return !(x == y); 65 | } 66 | 67 | inline std::string PolarString::ToString() const { 68 | std::string result; 69 | result.assign(data_, size_); 70 | return result; 71 | } 72 | 73 | inline int PolarString::compare(const PolarString& b) const { 74 | const size_t min_len = (size_ < b.size_) ? size_ : b.size_; 75 | int r = memcmp(data_, b.data_, min_len); 76 | if (r == 0) { 77 | if (size_ < b.size_) r = -1; 78 | else if (size_ > b.size_) r = +1; 79 | } 80 | return r; 81 | } 82 | 83 | 84 | } // namespace polar_race 85 | 86 | #endif // INCLUDE_POLAR_STRING_H_ 87 | -------------------------------------------------------------------------------- /test/README: -------------------------------------------------------------------------------- 1 | g++ -std=c++11 -o test -g -I.. 
test.cc -L../lib -lengine -lpthread -lrt -lz 2 | -------------------------------------------------------------------------------- /test/test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "include/engine.h" 5 | 6 | static const char kEnginePath[] = "/tmp/test_engine"; 7 | static const char kDumpPath[] = "/tmp/test_dump"; 8 | 9 | using namespace polar_race; 10 | 11 | class DumpVisitor : public Visitor { 12 | public: 13 | DumpVisitor(int* kcnt) 14 | : key_cnt_(kcnt) {} 15 | 16 | ~DumpVisitor() {} 17 | 18 | void Visit(const PolarString& key, const PolarString& value) { 19 | printf("Visit %s --> %s\n", key.data(), value.data()); 20 | (*key_cnt_)++; 21 | } 22 | 23 | private: 24 | int* key_cnt_; 25 | }; 26 | 27 | int main() { 28 | Engine *engine = NULL; 29 | 30 | RetCode ret = Engine::Open(kEnginePath, &engine); 31 | assert (ret == kSucc); 32 | 33 | 34 | ret = engine->Write("aaa", "aaaaaaaaaaa"); 35 | assert (ret == kSucc); 36 | ret = engine->Write("aaa", "111111111111111111111111111111111111111111"); 37 | ret = engine->Write("aaa", "2222222"); 38 | ret = engine->Write("aaa", "33333333333333333333"); 39 | ret = engine->Write("aaa", "4"); 40 | 41 | ret = engine->Write("bbb", "bbbbbbbbbbbb"); 42 | assert (ret == kSucc); 43 | 44 | ret = engine->Write("ccd", "cbbbbbbbbbbbb"); 45 | std::string value; 46 | ret = engine->Read("aaa", &value); 47 | printf("Read aaa value: %s\n", value.c_str()); 48 | 49 | ret = engine->Read("bbb", &value); 50 | assert (ret == kSucc); 51 | printf("Read bbb value: %s\n", value.c_str()); 52 | 53 | int key_cnt = 0; 54 | DumpVisitor vistor(&key_cnt); 55 | ret = engine->Range("b", "", vistor); 56 | assert (ret == kSucc); 57 | printf("Range key cnt: %d\n", key_cnt); 58 | 59 | 60 | delete engine; 61 | 62 | return 0; 63 | } 64 | --------------------------------------------------------------------------------