├── .gitignore ├── BTREE.txt ├── COPYING ├── Makefile ├── README ├── TODO ├── btree.c ├── btree.h └── btree_example.c /.gitignore: -------------------------------------------------------------------------------- 1 | .*.swp 2 | btree_example 3 | *.db 4 | *.dSYM 5 | -------------------------------------------------------------------------------- /BTREE.txt: -------------------------------------------------------------------------------- 1 | FORMAT OF INTEGERS 2 | ================== 3 | 4 | In the following document all the integers are intended to be in network byte 5 | order. All the sizes are stored as 32 bit unsigned integers, while all the 6 | pointers are stored as 64 bit unsigned integers. 7 | 8 | So the maximum data size, and in general the maximum number of items, is 9 | 2^32-1 while the maximum size of the database file is 2^64-1 bytes. 10 | 11 | LAYOUT 12 | ====== 13 | 14 | +------------------------------------------+ 15 | | HEADER | 16 | +------------------------------------------+ 17 | | 8 bytes freelist | 18 | +------------------------------------------+ 19 | | 16 bytes freelist | 20 | +------------------------------------------+ 21 | |... more power of two freelists ... | 22 | +------------------------------------------+ 23 | | 2GB bytes freelist | 24 | +------------------------------------------+ 25 | | ROOT node offset | 26 | +------------------------------------------+ 27 | / / 28 | / all the other nodes and data / 29 | +------------------------------------------+ 30 | 31 | Freelists are available for sizes for 8, 16, 32, 64, 128, 256, 512, 1024 32 | 2048, 4096, 8192, 16384, 32768, ..., 2GB, for a total of 29 free lists. 33 | 34 | ALIGNMENT REQUIREMENTS 35 | ====================== 36 | 37 | Every pointer on disk is aligned to 8 byte boundary, so that if the block is 38 | a multiple of 8 (512 and 4096 are for example) there are more consistency 39 | guarantees when writing a single pointer on disk for atomic updates. 
40 | 41 | In order to ensure this property we simply do the following: 42 | new blocks of data are always allocated in multiples of 8. 43 | Nodes, freelists, and all the structures we write inside data blocks 44 | always contain 8 byte aligned pointers. 45 | 46 | HEADER 47 | ====== 48 | 49 | +--------+--------+--------+--------+ 50 | | magic |version | free |freeoff | 51 | +--------+--------+--------+--------+ 52 | 53 | The magic is the 64 bit string "REDBTREE" 54 | 55 | The version field is the version of the btree, as an ascii string: "00000000". 56 | 57 | The freeoff is a pointer to the first byte of the file that is not used, so 58 | it can be used for allocations. When there is the need to allocate more space 59 | than available, the file size is enlarged using truncate(2). 60 | 61 | The free field is the amount of free space starting at freeoff. 64 bit. 62 | 63 | The file is always enlarged by at least BTREE_PREALLOC_SIZE that is a power 64 | of two. 65 | 66 | FREELIST BLOCK 67 | ============== 68 | 69 | +--------+--------+--------+--------+ 70 | | prev | next |numitems| item1 | 71 | +-----------------------------------+ 72 | / / 73 | / more items / 74 | / / 75 | +--------+--------------------------+ 76 | | item N | 77 | +--------+ 78 | 79 | Every free list block contains BTREE_FREELIST_BLOCK_ITEMS items. 80 | 81 | 'next' is the offset of the next free list block for the same size. If the 82 | block is the last one, next is set to the value of zero. 83 | 84 | 'prev' is the offset of the previous free list block for the same size. If the 85 | block is the first one, prev is set to the value of zero. 86 | 87 | 'numitems' is the number of used items in this freelist block. 88 | Since this is a count it can't go over 32 bits, but it is written as 64 bit on disk so that the free list block is just a sequence of N 64 bit numbers. 89 | 90 | Every item is just a pointer to some place of the file. 
91 | 92 | Implementations should try to take freelist pointers in memory, for 93 | instance, for every size: 94 | 95 | - Offsets for all the freelist blocks 96 | - Number of elements of the last block (all the other blocks are full) 97 | 98 | ALLOCATION 99 | ========== 100 | 101 | The btree allocates memory in pieces of power of two sizes. For instance if 102 | a new node has to be created, BTREE_ALLOC is called, with the size of the 103 | node. If the amount to allocate is not already a power of two, the allocator 104 | will return a block whose size is exactly the nearest power of two that is 105 | greater than the specified size. 106 | 107 | Data is allocated by looking at the free list for that size. If there is an 108 | available block to reuse, it is removed from the free list and reused 109 | (just updating the number of items in the free list block, or alternatively 110 | removing the block and the link from the previous block if this was the 111 | last item in the current free list block). 112 | 113 | If there is no space to reuse for the allocation, we check if there is 114 | data at the end of the file that is ready to be used. This is done simply 115 | by checking the difference between the 'totlen' and 'freeoff' fields in the 116 | header. If there is not enough space a truncate(2) operation is performed 117 | against the file to allocate more space at the end. 118 | 119 | Every time BTREE_ALLOC performs an allocation, the returned block is 120 | prefixed with a byte reporting the size of the allocated block. Since 121 | allocations are always a power of two, this is just the exponent so that 122 | 2 raised to it will provide the allocation size. 123 | 124 | So actually if we call BTREE_ALLOC(4), this is what happens: 125 | 126 | * one byte is added, so BTREE_ALLOC will really try to allocate 5 bytes. 127 | * the nearest power of two greater than 5 is 8. 128 | * the freelist for 8 bytes is checked. If there is an item it is reused. 
129 | (note that we don't need to write the size prefix when we reuse an 130 | item, there should already be the right number in the header). 131 | * if there is no free space, more free space is created at the end, of 132 | at least BTREE_PREALLOC_SIZE, or more if the allocation needed more. 133 | * The one byte header is written as first byte, and the pointer to the 134 | next byte is returned. 135 | 136 | RELEASING AN ALLOCATION 137 | ======================= 138 | 139 | BTREE_FREE does the contrary, releasing an allocation so that the space 140 | will be available for further allocations. 141 | 142 | What happens is simply that the pointer to the released allocation is 143 | put into the corresponding free list. 144 | 145 | As you can see the size of the btree file will never get smaller, as even 146 | when memory is released we take it pre-allocated into free lists. 147 | A tool will be provided for off line compaction of databases that for some 148 | reason need to be restored to the minimum size, for instance for backups 149 | or WAN transfers. 150 | 151 | BTREE NODE 152 | ========== 153 | 154 | The B-tree node is composed of: 155 | 156 | N keys 157 | N value pointers 158 | N+1 pointers to child nodes 159 | 160 | Every node can have from BTREE_MIN_KEYS to BTREE_MAX_KEYS keys. 161 | 162 | All the keys are the same size of 16 bytes, that is, the first 16 bytes of 163 | the SHA1 sum of the real key if big keys support is enabled. 164 | 165 | Keys may also be 128 bit numbers when the btree is used as an index, or 166 | fixed length 16 byte keys, possibly zero padded at the end if the key 167 | is not binary safe but may be shorter. 168 | 169 | +--------+--------+--------+--------+ 170 | | start |numkeys | isleaf | notused| 171 | +--------+--------+--------+--------+ 172 | | key1 | 173 | +--------+--------+--------+--------+ 174 | | key2 | 175 | +--------+--------+--------+--------+ 176 | | ... all the other keys .. 
| 177 | +--------+--------+--------+--------+ 178 | | value pointer 1 | value pointer 2 | 179 | +--------+--------+--------+--------+ 180 | | ... N value pointers in total ... | 181 | +-----------------------------------+ 182 | | child pointer 1 | child pointer 2 | 183 | +--------+--------+--------+--------+ 184 | | .. N+1 child pointers in total .. | 185 | +--------+--------------------------+ 186 | | end | 187 | +--------+ 188 | 189 | start is just a random 32 bit number. The same random number is also written 190 | in the end field. This is used in order to detect corruptions (half written 191 | nodes). 192 | 193 | numkeys is the number of keys in this node, and is a count (32 bit integer). 194 | 195 | isleaf is set to 0 if the node is not a leaf, otherwise to 1. (32 bit integer) 196 | 197 | notused is an unused field, 32 bit. It was added in order to make sure all the 198 | pointers are 8 byte aligned; it may be used in future versions of the btree. 199 | 200 | key1 ... keyN are the fixed length keys, 16 bytes. If support for big keys 201 | is enabled, this is the SHA1 of the key, truncated at 16 bytes. 202 | 203 | All the pointers are simply 64 bit unsigned offsets. 204 | 205 | All nodes are allocated with space for the maximum number of keys, so for 206 | instance if BTREE_MAX_KEYS is 255, every node will be: 207 | 208 | 4 + 4 + 4 + 4 + 16*255 + 8*255 + 8*256 + 4 bytes = 8188 bytes. 209 | 210 | It is important to select BTREE_MAX_KEYS so that the resulting node size is just a little smaller 211 | than a power of two. In this case 8188 is just a bit smaller than 8192. 212 | 213 | REDIS LEVEL OPERATIONS 214 | ====================== 215 | 216 | DISKSTORE LOCK and DISKSTORE UNLOCK commands may be implemented in order to 217 | tell Redis to hold new changes in memory and don't write new things on 218 | diskstore, exactly like what happens when a BGSAVE is in progress with diskstore 219 | enabled. 
This is useful in order to allow the system administrator to copy 220 | the B-TREE while Redis is running. 221 | 222 | FREE LIST HANDLING 223 | ================== 224 | 225 | Let's assume that: 226 | 227 | 1) The free list block of our btree is 512 bytes. 228 | 2) We receive a btree_free() request against another block of 512 bytes. 229 | 3) The latest free list block for size 512 bytes is completely full: 230 | 231 | [UUUUU ... UUUUU] (U means "used") 232 | 233 | Since we received a btree_free() we need to put a new item inside the free list 234 | block, but since it is full we need to allocate a new 512 byte block. 235 | Allocating this block would result in the current block having a free item! 236 | So after we link the new block we'll obtain: 237 | 238 | [UUUUU ... UUUU ] -> [ ] 239 | 240 | That's not correct as the previous block now has a free item. 241 | 242 | So what we do instead is to use the block that we should put into the free list 243 | as a new block for the free list. So the final result is: 244 | 245 | [UUUUU ... UUUUU] -> [ ] 246 | 247 | That is what we want. 248 | 249 | On the contrary, if we want to allocate a block 512 bytes in size, and there 250 | is an empty block at the tail of our free list, we just use the block itself. 251 | 252 | RANDOM IDEAS / TODO 253 | =================== 254 | 255 | 1) Take a global incremental 64 bit counter and set it always both at the start and at the end of a node, so that it is possible to detect half written blocks. 256 | 2) Perhaps it is better to return allocations always zeroed, even when we reuse an entry in the freelist. 257 | 3) Check at startup that the file length is equal to freeoff+free in header. 258 | 259 | USEFUL RESOURCES ON BTREES 260 | ========================== 261 | 262 | - Introduction to Algorithms (Cormen, ...), chapter on B-TREEs. 
263 | - http://dr-josiah.blogspot.com/2010/08/databases-on-ssds-initial-ideas-on.html 264 | - http://www.eecs.berkeley.edu/~brewer/cs262/LFS.pdf 265 | 266 | NODE SPLITTING 267 | ============== 268 | 269 | x: c h 270 | / | \ 271 | / | \ 272 | a | z zz zzz 273 | | 274 | y: d e ee 275 | 276 | 277 | i = 1 278 | ----- 279 | 280 | x: c e h 281 | / | | \ 282 | / | | \ 283 | a | | z zz zzz 284 | | | 285 | y: d ee 286 | 287 | 288 | k[i+1] = k[i], for numkeys-i = (2 - 1) = 1 289 | c[i+2] = c[i+1], for numkeys-i = (2 - 1) = 1 290 | k[i] = e 291 | c[i] = left 292 | c[i+1] = right 293 | 294 | i = 2 295 | ----- 296 | 297 | x: c h zz 298 | / | \ \ 299 | / | \ \ 300 | a | z zzz 301 | | 302 | y: d e ee 303 | 304 | k[3] = k[2], for numkeys-i = (2-2) = 0 305 | c[4] = c[3], for numkeys-i = (2-2) = 0 306 | k[2] = zz 307 | c[i] = left 308 | c[i+1] = right 309 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011, Salvatore Sanfilippo 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | * Neither the name of Redis nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
9 | 10 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 11 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: btree-example 2 | 3 | btree-example: btree.c btree_example.c 4 | $(CC) -o btree_example btree.c btree_example.c -Wall -W -g -rdynamic -ggdb -O2 5 | 6 | clean: 7 | rm -rf btree_example btree_example.dSYM 8 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | There are a number of btree (and variants) implementations around, but for 2 | many reasons like complexity and tight coupling with external code bases, or 3 | licensing issues, to find an easy to embed implementation of an on disk btree 4 | is not an easy task. 5 | 6 | Since an on disk btree is a tool useful in a number of software projects this 7 | library is an attempt to bring to the table an open implementation of a btree. 8 | The term "open" here means: simple to use, BSD licensed, well documented, and 9 | simple to modify and understand. 
10 | 11 | CURRENT STATUS 12 | ============== 13 | 14 | Currently this is a work in progress, so far we have a subset of basic btree 15 | operations implemented on top of an on disk allocator (something like a 16 | file-based malloc). 17 | 18 | Supported operations are adding new keys and splitting of nodes. 19 | Deletion is not supported, nor update of old values. 20 | In other words the project is NOT usable so far, more work is needed. 21 | 22 | Currently everything is written on disk on every write for the sake of 23 | simplicity of the first implementation, but work is in progress in order 24 | to cache the allocator metadata in memory, so that performance can be 25 | improved. 26 | 27 | The goal is to eventually support all the following features: 28 | 29 | - A compromise between fast implementation and ability to incrementally reclaim 30 | memory from disk automatically. This is why we have the on disk allocator. 31 | - Range queries using 128 bit precision integers, to use the btree as index. 32 | - Good tools for recovering and checking the btree. 33 | - Good documentation. 34 | 35 | Perhaps in the future: 36 | 37 | - Optional append only mode with compaction, for higher corruption resistance. 38 | 39 | In the first stage of the project the goal is to be good enough for the Redis 40 | project (in order to use this library for the diskstore feature of Redis). 41 | However while trying to reach this goal every care will be used in order to 42 | retain a great level of generality of this lib that will continue to live as a 43 | stand alone library. Redis will just happen to use a copy of it. 44 | 45 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | There are tons of things to do as the lib is currently a work in progress 2 | and not ready to be used. So this is just a list of random things that must 3 | be done sooner or later. 
4 | 5 | - In memory freelist. So there is no disk access for allocation/free, simply the btree will leak some memory on disk when the application does not properly close it. 6 | - crc32 in btree values. In the current allocation header we use a 64 bit length field that is too much as our max allocation is 2GB. We needed the 8 byte header in order to preserve alignment. But we can use four of these bytes for crc32 purposes. This way the btree-check utility can validate values in a data agnostic way. 7 | - The btree-check utility should be able to rewrite the freelists. It can simply create an in-memory bitmap representing every 8 byte block of the btree. Then walk the whole btree, flipping every used 8 byte block to 1. At the end we can do a one-pass scan on the bitmap to populate all the free lists. 8 | -------------------------------------------------------------------------------- /btree.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2011, Salvatore Sanfilippo 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * * Redistributions of source code must retain the above copyright notice, 9 | * this list of conditions and the following disclaimer. 10 | * * Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * * Neither the name of Redis nor the names of its contributors may be used 14 | * to endorse or promote products derived from this software without 15 | * specific prior written permission. 
16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 21 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 | * POSSIBILITY OF SUCH DAMAGE. 28 | */ 29 | 30 | #include "btree.h" 31 | 32 | #include <stdio.h> 33 | #include <stdlib.h> 34 | #include <string.h> 35 | #include <errno.h> 36 | #include <sys/time.h> 37 | 38 | int btree_create(struct btree *bt); 39 | int btree_read_metadata(struct btree *bt); 40 | struct btree_node *btree_create_node(void); 41 | void btree_free_node(struct btree_node *n); 42 | int btree_write_node(struct btree *bt, struct btree_node *n, uint64_t offset); 43 | int btree_freelist_index_by_exp(int exponent); 44 | int btree_split_child(struct btree *bt, uint64_t pointedby, uint64_t parentoff, 45 | int i, uint64_t childoff, uint64_t *newparent); 46 | 47 | /* ------------------------ UNIX standard VFS Layer ------------------------- */ 48 | #include <unistd.h> 49 | #include <fcntl.h> 50 | #include <sys/stat.h> 51 | 52 | void *bvfs_unistd_open(char* path, int flags) { 53 | int fd; 54 | void *handle; 55 | 56 | fd = open(path,((flags & BTREE_CREAT) ? 
O_CREAT : 0)|O_RDWR,0644); 57 | if (fd == -1) return NULL; 58 | if ((handle = malloc(sizeof(fd))) == NULL) { /* Out of memory: don't leak the descriptor, and report it the same way btree_open() does. */ close(fd); errno = ENOMEM; return NULL; } 59 | *(int*)handle = fd; /* the handle is a heap copy of the descriptor */ 60 | return handle; 61 | } 62 | /* Close the descriptor stored in 'handle' and free the handle itself. */ 63 | void bvfs_unistd_close(void *handle) { 64 | int *fd = handle; 65 | 66 | close(*fd); 67 | free(handle); 68 | } 69 | /* Read up to 'nbytes' bytes at absolute 'offset'; returns whatever pread(2) returns. */ 70 | ssize_t bvfs_unistd_pread(void *handle, void *buf, uint32_t nbytes, 71 | uint64_t offset) 72 | { 73 | int *fd = handle; 74 | 75 | return pread(*fd,buf,nbytes,offset); 76 | } 77 | /* Write up to 'nbytes' bytes at absolute 'offset'; returns whatever pwrite(2) returns. */ 78 | ssize_t bvfs_unistd_pwrite(void *handle, const void *buf, uint32_t nbytes, 79 | uint64_t offset) 80 | { 81 | int *fd = handle; 82 | 83 | return pwrite(*fd,buf,nbytes,offset); 84 | } 85 | /* Set the file length to 'length' bytes with ftruncate(2). */ 86 | int bvfs_unistd_resize(void *handle, uint64_t length) { 87 | int *fd = handle; 88 | 89 | return ftruncate(*fd,length); 90 | } 91 | /* Store the current file size in '*size'. Returns 0 on success, -1 if fstat(2) fails. */ 92 | int bvfs_unistd_getsize(void *handle, uint64_t *size) { 93 | int *fd = handle; 94 | struct stat sb; 95 | 96 | if (fstat(*fd,&sb) == -1) return -1; 97 | *size = (uint64_t) sb.st_size; 98 | return 0; 99 | } 100 | /* Flush the file with fsync(2); the result is ignored because the VFS sync hook returns void. */ 101 | void bvfs_unistd_sync(void *handle) { 102 | int *fd = handle; 103 | 104 | fsync(*fd); 105 | } 106 | /* VFS used by btree_open() when no VFS is supplied. Positional initializer: the order must match struct btree_vfs in btree.h. */ 107 | struct btree_vfs bvfs_unistd = { 108 | bvfs_unistd_open, 109 | bvfs_unistd_close, 110 | bvfs_unistd_pread, 111 | bvfs_unistd_pwrite, 112 | bvfs_unistd_resize, 113 | bvfs_unistd_getsize, 114 | bvfs_unistd_sync 115 | }; 116 | 117 | /* ------------------------- From/To Big endian ----------------------------- */ 118 | /* Store 'val' in buf[0..3], most significant byte first. */ 119 | void btree_u32_to_big(unsigned char *buf, uint32_t val) { 120 | buf[0] = (val >> 24) & 0xff; 121 | buf[1] = (val >> 16) & 0xff; 122 | buf[2] = (val >> 8) & 0xff; 123 | buf[3] = val & 0xff; 124 | } 125 | /* Store 'val' in buf[0..7], most significant byte first. */ 126 | void btree_u64_to_big(unsigned char *buf, uint64_t val) { 127 | buf[0] = (val >> 56) & 0xff; 128 | buf[1] = (val >> 48) & 0xff; 129 | buf[2] = (val >> 40) & 0xff; 130 | buf[3] = (val >> 32) & 0xff; 131 | buf[4] = (val >> 24) & 0xff; 132 | buf[5] = (val >> 16) & 0xff; 133 | buf[6] = (val >> 8) & 0xff; 134 | buf[7] = val & 0xff; 135 | } 136 | /* Decode the 4 big endian bytes at 'buf' into a uint32_t. */ 137 | uint32_t 
btree_u32_from_big(unsigned char *buf) { 138 | uint32_t val = 0; 139 | 140 | val |= buf[0] << 24; 141 | val |= buf[1] << 16; 142 | val |= buf[2] << 8; 143 | val |= buf[3]; 144 | return val; 145 | } 146 | 147 | uint64_t btree_u64_from_big(unsigned char *buf) { 148 | uint64_t val = 0; 149 | 150 | val |= (uint64_t) buf[0] << 56; 151 | val |= (uint64_t) buf[1] << 48; 152 | val |= (uint64_t) buf[2] << 40; 153 | val |= (uint64_t) buf[3] << 32; 154 | val |= (uint64_t) buf[4] << 24; 155 | val |= (uint64_t) buf[5] << 16; 156 | val |= (uint64_t) buf[6] << 8; 157 | val |= (uint64_t) buf[7]; 158 | return val; 159 | } 160 | 161 | /* -------------------------- Utility functions ----------------------------- */ 162 | 163 | /* We read and write too often to write bt->vfs->...(bt->vfs_handle...) all the 164 | * times, so we use this two help functions. */ 165 | ssize_t btree_pwrite(struct btree *bt, const void *buf, uint32_t nbytes, 166 | uint64_t offset) 167 | { 168 | return bt->vfs->pwrite(bt->vfs_handle,buf,nbytes,offset); 169 | } 170 | 171 | ssize_t btree_pread(struct btree *bt, void *buf, uint32_t nbytes, 172 | uint64_t offset) 173 | { 174 | return bt->vfs->pread(bt->vfs_handle,buf,nbytes,offset); 175 | } 176 | 177 | /* We want to be able to write and read 32 and 64 integers easily and in a 178 | * platform / endianess agnostic way. 
*/ 179 | ssize_t btree_pwrite_u32(struct btree *bt, uint32_t val, uint64_t offset) { 180 | unsigned char buf[4]; 181 | 182 | btree_u32_to_big(buf,val); 183 | return btree_pwrite(bt,buf,sizeof(buf),offset); 184 | } 185 | /* Write 'val' as 8 big endian bytes at 'offset'; returns the btree_pwrite() result converted to int. */ 186 | int btree_pwrite_u64(struct btree *bt, uint64_t val, uint64_t offset) { 187 | unsigned char buf[8]; 188 | 189 | btree_u64_to_big(buf,val); 190 | return btree_pwrite(bt,buf,sizeof(buf),offset); 191 | } 192 | /* Read 4 big endian bytes at 'offset' into '*val'. Returns 0, or -1 when btree_pread() reports -1. NOTE(review): a short read is not detected here -- confirm whether the VFS can return one. */ 193 | int btree_pread_u32(struct btree *bt, uint32_t *val, uint64_t offset) { 194 | unsigned char buf[4]; 195 | 196 | if (btree_pread(bt,buf,sizeof(buf),offset) == -1) return -1; 197 | *val = btree_u32_from_big(buf); 198 | return 0; 199 | } 200 | /* Read 8 big endian bytes at 'offset' into '*val'. Returns 0, or -1 when btree_pread() reports -1 (same short-read caveat as the 32 bit variant). */ 201 | int btree_pread_u64(struct btree *bt, uint64_t *val, uint64_t offset) { 202 | unsigned char buf[8]; 203 | 204 | if (btree_pread(bt,buf,sizeof(buf),offset) == -1) return -1; 205 | *val = btree_u64_from_big(buf); 206 | return 0; 207 | } 208 | /* Call the VFS sync hook, but only while BTREE_FLAG_USE_WRITE_BARRIER is set in bt->flags. */ 209 | void btree_sync(struct btree *bt) { 210 | if (bt->flags & BTREE_FLAG_USE_WRITE_BARRIER) 211 | bt->vfs->sync(bt->vfs_handle); 212 | } 213 | 214 | /* ---------------------------- BTREE operations ---------------------------- */ 215 | /* Turn on every bit of 'flags' in bt->flags; other bits are left alone. */ 216 | void btree_set_flags(struct btree *bt, int flags) { 217 | bt->flags |= flags; 218 | } 219 | /* Turn off every bit of 'flags' in bt->flags; other bits are left alone. */ 220 | void btree_clear_flags(struct btree *bt, int flags) { 221 | bt->flags &= ~flags; 222 | } 223 | 224 | /* Open a btree. On error NULL is returned, and errno is set accordingly. 225 | * If 'vfs' is NULL the built-in bvfs_unistd VFS is used. Flags modify the behavior of the call: 226 | * 227 | * BTREE_CREAT: create the btree if it does not exist. */ 228 | struct btree *btree_open(struct btree_vfs *vfs, char *path, int flags) { 229 | struct btree *bt = NULL; 230 | struct timeval tv; 231 | int j, mkroot = 0; 232 | 233 | /* Initialize a new btree structure */ 234 | if ((bt = malloc(sizeof(*bt))) == NULL) { 235 | errno = ENOMEM; 236 | return NULL; 237 | } 238 | bt->vfs = vfs ? 
vfs : &bvfs_unistd; 239 | bt->vfs_handle = NULL; 240 | bt->flags = BTREE_FLAG_USE_WRITE_BARRIER; 241 | for (j = 0; j < BTREE_FREELIST_COUNT; j++) { 242 | bt->freelist[j].numblocks = 0; 243 | bt->freelist[j].blocks = NULL; 244 | bt->freelist[j].last_items = 0; 245 | } 246 | 247 | /* Try opening the specified btree first with no flags, so that btree_create() only runs in the fallback below */ 248 | bt->vfs_handle = bt->vfs->open(path,0); 249 | if (bt->vfs_handle == NULL) { 250 | if (!(flags & BTREE_CREAT)) goto err; 251 | /* Create the btree */ 252 | if ((bt->vfs_handle = bt->vfs->open(path,flags)) == NULL) goto err; 253 | if (btree_create(bt) == -1) goto err; 254 | mkroot = 1; /* Create the root node before returning */ 255 | } 256 | 257 | /* There are things about our btree that we always take in memory, 258 | * like all the free list block pointers and so forth. 259 | * Once we open the btree, we need to load this data into memory. */ 260 | if (btree_read_metadata(bt) == -1) goto err; 261 | gettimeofday(&tv,NULL); 262 | bt->mark = (uint32_t) random() ^ tv.tv_sec ^ tv.tv_usec; /* starting value for the start/end marks that btree_write_node() stamps on nodes */ 263 | 264 | /* Write the root node if needed (only when DB is created) */ 265 | if (mkroot) { 266 | struct btree_node *root; 267 | uint64_t rootptr; 268 | 269 | /* Allocate space for the root; btree_alloc() returning 0 is treated as failure */ 270 | if ((rootptr = btree_alloc(bt,BTREE_NODE_SIZE)) == 0) goto err; 271 | 272 | /* Create a fresh root node and write it on disk */ 273 | if ((root = btree_create_node()) == NULL) goto err; 274 | root->isleaf = 1; /* Our first node is a leaf */ 275 | if (btree_write_node(bt,root,rootptr) == -1) { 276 | btree_free_node(root); 277 | goto err; 278 | } 279 | btree_free_node(root); 280 | btree_sync(bt); /* barrier: presumably so the node is on disk before the header points at it -- confirm */ 281 | 282 | /* Write the root node pointer. 
*/ 283 | if (btree_pwrite_u64(bt,rootptr,BTREE_HDR_ROOTPTR_POS) == -1) goto err; 284 | bt->rootptr = rootptr; 285 | btree_sync(bt); 286 | } 287 | return bt; 288 | 289 | err: 290 | btree_close(bt); 291 | return NULL; 292 | } 293 | 294 | /* Close a btree, even one that was unsuccesfull opened, so that 295 | * btree_open() can use this function for cleanup on error. */ 296 | void btree_close(struct btree *bt) { 297 | int j; 298 | 299 | if (!bt) return; 300 | if (bt->vfs_handle) bt->vfs->close(bt->vfs_handle); 301 | for (j = 0; j < BTREE_FREELIST_COUNT; j++) 302 | free(bt->freelist[j].blocks); 303 | free(bt); 304 | } 305 | 306 | #include 307 | 308 | /* Create a new btree, populating the header, free lists. 309 | * Note that this function is not exported, as callers should create a new 310 | * btree using open with the BTREE_CREAT flag. */ 311 | int btree_create(struct btree *bt) { 312 | int size, j; 313 | uint64_t filesize, freeoff; 314 | 315 | /* Make room for all the objects we have in the header */ 316 | if (bt->vfs->getsize(bt->vfs_handle,&filesize) == -1) return -1; 317 | assert(filesize == 0); 318 | 319 | /* header: magic, version, free, freeoff */ 320 | size = 8*4; 321 | /* Then we have our root free lists */ 322 | size += BTREE_FREELIST_COUNT * BTREE_FREELIST_BLOCK_SIZE; 323 | /* And finally our root node pointer and actual node */ 324 | size += 8; /* root pointer */ 325 | size += BTREE_NODE_SIZE; /* root node */ 326 | if (bt->vfs->resize(bt->vfs_handle,size) == -1) return -1; 327 | 328 | /* Now we have enough space to actually build the btree header, 329 | * free lists, and root node. 
*/ 330 | 331 | /* Magic and version */ 332 | if (btree_pwrite(bt,"REDBTREE00000000",16,0) == -1) return -1; 333 | 334 | /* Free and Freeoff */ 335 | if (btree_pwrite_u64(bt,0,BTREE_HDR_FREE_POS) == -1) return -1; 336 | freeoff = 32+BTREE_FREELIST_BLOCK_SIZE*BTREE_FREELIST_COUNT+8+BTREE_NODE_SIZE; 337 | if (btree_pwrite_u64(bt,freeoff,BTREE_HDR_FREEOFF_POS) == -1) return -1; 338 | 339 | /* Free lists */ 340 | for (j = 0; j < BTREE_FREELIST_COUNT; j++) { 341 | uint64_t off = 32+BTREE_FREELIST_BLOCK_SIZE*j; 342 | 343 | /* next and prev pointers are set to zero, as this is the first 344 | * and sole block for this size. */ 345 | if (btree_pwrite_u64(bt,0,off) == -1) return -1; 346 | if (btree_pwrite_u64(bt,0,off+8) == -1) return -1; 347 | /* Set count as zero, as we have no entry inside this block */ 348 | if (btree_pwrite_u32(bt,0,off+16) == -1) return -1; 349 | } 350 | return 0; 351 | } 352 | 353 | int btree_read_metadata(struct btree *bt) { 354 | int j; 355 | 356 | /* TODO: Check signature and version. */ 357 | /* Read free space and offset information */ 358 | if (btree_pread_u64(bt,&bt->free,BTREE_HDR_FREE_POS) == -1) return -1; 359 | if (btree_pread_u64(bt,&bt->freeoff,BTREE_HDR_FREEOFF_POS) == -1) return -1; 360 | /* TODO: check that they makes sense considered the file size. 
*/ 361 | /* Read root node pointer */ 362 | if (btree_pread_u64(bt,&bt->rootptr,BTREE_HDR_ROOTPTR_POS) == -1) return -1; 363 | printf("Root node is at %llu\n", bt->rootptr); 364 | /* Read free lists information */ 365 | for (j = 0; j < BTREE_FREELIST_COUNT; j++) { 366 | uint64_t ptr = 32+BTREE_FREELIST_BLOCK_SIZE*j; 367 | uint64_t nextptr, numitems; 368 | 369 | // printf("Load metadata for freelist %d\n", j); 370 | do { 371 | struct btree_freelist *fl = &bt->freelist[j]; 372 | 373 | if (btree_pread_u64(bt,&nextptr,ptr+sizeof(uint64_t)) == -1) 374 | return -1; 375 | if (btree_pread_u64(bt,&numitems,ptr+sizeof(uint64_t)*2) == -1) 376 | return -1; 377 | // printf(" block %lld: %lld items (next: %lld)\n", ptr, numitems, 378 | // nextptr); 379 | fl->blocks = realloc(fl->blocks,sizeof(uint64_t)*(fl->numblocks+1)); 380 | if (fl->blocks == NULL) return -1; 381 | fl->blocks[fl->numblocks] = ptr; 382 | fl->numblocks++; 383 | fl->last_items = numitems; 384 | ptr = nextptr; 385 | } while(ptr); 386 | } 387 | return 0; 388 | } 389 | 390 | /* Create a new node in memory */ 391 | struct btree_node *btree_create_node(void) { 392 | struct btree_node *n = calloc(1,sizeof(*n)); 393 | 394 | return n; 395 | } 396 | 397 | void btree_free_node(struct btree_node *n) { 398 | free(n); 399 | } 400 | 401 | /* Write a node on disk at the specified offset. Returns 0 on success. 402 | * On error -1 is returne and errno set accordingly. */ 403 | int btree_write_node(struct btree *bt, struct btree_node *n, uint64_t offset) { 404 | unsigned char buf[BTREE_NODE_SIZE]; 405 | unsigned char *p = buf; 406 | int j; 407 | 408 | bt->mark++; 409 | btree_u32_to_big(p,bt->mark); p += 4; /* start mark */ 410 | btree_u32_to_big(p,n->numkeys); p += 4; /* number of keys */ 411 | btree_u32_to_big(p,n->isleaf); p += 4; /* is a leaf? 
*/ 412 | btree_u32_to_big(p,0); p += 4; /* unused field, needed for alignment */ 413 | memcpy(p,n->keys,sizeof(n->keys)); p += sizeof(n->keys); /* keys */ 414 | /* values */ 415 | for (j = 0; j < BTREE_MAX_KEYS; j++) { 416 | btree_u64_to_big(p,n->values[j]); 417 | p += 8; 418 | } 419 | /* children */ 420 | for (j = 0; j <= BTREE_MAX_KEYS; j++) { 421 | btree_u64_to_big(p,n->children[j]); 422 | p += 8; 423 | } 424 | btree_u32_to_big(p,bt->mark); p += 4; /* end mark */ 425 | return btree_pwrite(bt,buf,sizeof(buf),offset); 426 | } 427 | 428 | /* Read a node from the specified offset. 429 | * On success the in memory representation of the node is returned as a 430 | * btree_node structure (to be freed with btree_free_node). On error 431 | * NULL is returned and errno set accordingly. 432 | * 433 | * If data on disk is corrupted errno is set to EFAULT. */ 434 | struct btree_node *btree_read_node(struct btree *bt, uint64_t offset) { 435 | unsigned char buf[BTREE_NODE_SIZE], *p; 436 | struct btree_node *n; 437 | int j; 438 | 439 | if (btree_pread(bt,buf,sizeof(buf),offset) == -1) return NULL; 440 | /* Verify start/end marks */ 441 | if (memcmp(buf,buf+BTREE_NODE_SIZE-4,4)) { 442 | errno = EFAULT; 443 | return NULL; 444 | } 445 | if ((n = btree_create_node()) == NULL) return NULL; 446 | 447 | p = buf+4; 448 | n->numkeys = btree_u32_from_big(p); p += 4; /* number of keys */ 449 | n->isleaf = btree_u32_from_big(p); p += 4; /* is a leaf? 
*/ 450 | p += 4; /* unused field, needed for alignment */ 451 | memcpy(n->keys,p,sizeof(n->keys)); p += sizeof(n->keys); /* keys */ 452 | /* values */ 453 | for (j = 0; j < BTREE_MAX_KEYS; j++) { 454 | n->values[j] = btree_u64_from_big(p); 455 | p += 8; 456 | } 457 | /* children */ 458 | for (j = 0; j <= BTREE_MAX_KEYS; j++) { 459 | n->children[j] = btree_u64_from_big(p); 460 | p += 8; 461 | } 462 | return n; 463 | } 464 | 465 | /* ------------------------- disk space allocator --------------------------- */ 466 | 467 | /* Compute logarithm in base two of 'n', with 'n' being a power of two. 468 | * Probably you can just check the latest 1 bit set, but here it's not 469 | * a matter of speed as we are dealing with the disk every time we call 470 | * this function. */ 471 | int btree_log_two(uint32_t n) { 472 | int log = -1; 473 | 474 | while(n) { 475 | log++; 476 | n /= 2; 477 | } 478 | return log; 479 | } 480 | 481 | int btree_alloc_freelist(struct btree *bt, uint32_t realsize, uint64_t *ptr) { 482 | int exp = btree_log_two(realsize); 483 | int fli = btree_freelist_index_by_exp(exp); 484 | struct btree_freelist *fl = &bt->freelist[fli]; 485 | uint64_t block, lastblock = 0, p; 486 | 487 | if (fl->last_items == 0 && fl->numblocks == 1) { 488 | *ptr = 0; 489 | return 0; 490 | } 491 | 492 | /* Last block is empty? Remove it */ 493 | if (fl->last_items == 0) { 494 | uint64_t prevblock, *oldptr; 495 | 496 | assert(fl->numblocks > 1); 497 | /* Set prevblock next pointer to NULL */ 498 | prevblock = fl->blocks[fl->numblocks-2]; 499 | if (btree_pwrite_u64(bt,0,prevblock+sizeof(uint64_t)) == -1) return -1; 500 | btree_sync(bt); 501 | /* Fix our memory representaiton of freelist */ 502 | lastblock = fl->blocks[fl->numblocks-1]; 503 | fl->numblocks--; 504 | /* The previous item must be full, so we set the new number 505 | * of items to the max. */ 506 | fl->last_items = BTREE_FREELIST_BLOCK_ITEMS; 507 | /* Realloc the block as we have one element less. 
*/ 508 | oldptr = fl->blocks; 509 | fl->blocks = realloc(fl->blocks,sizeof(uint64_t)*fl->numblocks); 510 | if (fl->blocks == NULL) { 511 | /* Out of memory. The realloc failed, but note that while this 512 | * is a leak as the block remains larger than needed we still 513 | * have a valid in memory representation. */ 514 | fl->blocks = oldptr; 515 | return -1; 516 | } 517 | } 518 | 519 | /* There was a block to remove, but this block is the same size 520 | * of the allocation required? Just return it. */ 521 | if (lastblock && exp == BTREE_FREELIST_SIZE_EXP) { 522 | *ptr = lastblock; 523 | return 0; 524 | } else { 525 | btree_free(bt,lastblock); 526 | } 527 | 528 | /* Get an element from the current block, and return it to the 529 | * caller. */ 530 | block = fl->blocks[fl->numblocks-1]; 531 | if (btree_pread_u64(bt,&p,block+((2+fl->last_items)*sizeof(uint64_t))) == -1) return -1; 532 | fl->last_items--; 533 | if (btree_pwrite_u64(bt,fl->last_items,block+(2*sizeof(uint64_t))) == -1) return -1; 534 | btree_sync(bt); 535 | *ptr = p+sizeof(uint64_t); 536 | return 0; 537 | } 538 | 539 | /* Return the next power of two that is able to hold size+1 bytes. 540 | * The byte we add is used to save the exponent of two as the first byte 541 | * so that for btree_free() can check the block size. */ 542 | uint32_t btree_alloc_realsize(uint32_t size) { 543 | uint32_t realsize; 544 | 545 | realsize = 16; /* We don't allocate nothing that is smaller than 16 bytes */ 546 | while (realsize < (size+sizeof(uint64_t))) realsize *= 2; 547 | return realsize; 548 | } 549 | 550 | /* Allocate some piece of data on disk. Returns the offset to the newly 551 | * allocated space. If the allocation can't be performed, 0 is returned. 
*/ 552 | uint64_t btree_alloc(struct btree *bt, uint32_t size) { 553 | uint64_t ptr; 554 | uint32_t realsize; 555 | 556 | printf("ALLOCATIING %lu\n", (unsigned long) size); 557 | 558 | /* Don't allow allocations bigger than 2GB */ 559 | if (size > (unsigned)(1<<31)) { 560 | errno = EINVAL; 561 | return 0; 562 | } 563 | realsize = btree_alloc_realsize(size); 564 | 565 | /* Search for free space in the free lists */ 566 | if (btree_alloc_freelist(bt,realsize,&ptr) == -1) return 0; 567 | if (ptr) { 568 | uint64_t oldsize; 569 | /* Got an element from the free list. Fix the size header if needed. */ 570 | if (btree_pread_u64(bt,&oldsize,ptr-sizeof(uint64_t)) == -1) return 0; 571 | if (oldsize != size) { 572 | if (btree_pwrite_u64(bt,size,ptr-sizeof(uint64_t)) == -1) 573 | return 0; 574 | btree_sync(bt); 575 | } 576 | return ptr; 577 | } 578 | 579 | /* We have to perform a real allocation. 580 | * If we don't have room at the end of the file, create some space. */ 581 | if (bt->free < realsize) { 582 | uint64_t currsize = bt->freeoff + bt->free; 583 | if (bt->vfs->resize(bt->vfs_handle,currsize+BTREE_PREALLOC_SIZE) == -1) 584 | return 0; 585 | bt->free += BTREE_PREALLOC_SIZE; 586 | } 587 | 588 | /* Allocate it moving the header pointers and free space count */ 589 | ptr = bt->freeoff; 590 | bt->free -= realsize; 591 | bt->freeoff += realsize; 592 | 593 | if (btree_pwrite_u64(bt,bt->free,BTREE_HDR_FREE_POS) == -1) return -1; 594 | if (btree_pwrite_u64(bt,bt->freeoff,BTREE_HDR_FREEOFF_POS) == -1) return -1; 595 | 596 | /* Write the size header in the new allocated space */ 597 | if (btree_pwrite_u64(bt,size,ptr) == -1) return -1; 598 | 599 | /* A final fsync() as a write barrier */ 600 | btree_sync(bt); 601 | return ptr+sizeof(uint64_t); 602 | } 603 | 604 | /* Given an on disk pointer returns the length of the original allocation 605 | * (not the size of teh chunk itself as power of two, but the original 606 | * argument passed to btree_alloc function). 
607 | * 608 | * On success 0 is returned and the size parameter populated, otherwise 609 | * -1 is returned and errno set accordingly. */ 610 | int btree_alloc_size(struct btree *bt, uint32_t *size, uint64_t ptr) { 611 | uint64_t s; 612 | 613 | if (btree_pread_u64(bt,&s,ptr-8) == -1) return -1; 614 | *size = (uint32_t) s; 615 | return 0; 616 | } 617 | 618 | /* Return the free list slot index given the power of two exponent representing 619 | * the size of the free list allocations. */ 620 | int btree_freelist_index_by_exp(int exponent) { 621 | assert(exponent > 1 && exponent < 32); 622 | return exponent-4; 623 | } 624 | 625 | /* Release allocated memory, putting the pointer in the right free list. 626 | * On success 0 is returned. On error -1. */ 627 | int btree_free(struct btree *bt, uint64_t ptr) { 628 | uint64_t size; 629 | uint32_t realsize; 630 | int fli, exp; 631 | struct btree_freelist *fl; 632 | 633 | if (btree_pread_u64(bt,&size,ptr-sizeof(uint64_t)) == -1) return -1; 634 | realsize = btree_alloc_realsize(size); 635 | exp = btree_log_two(realsize); 636 | printf("Free %llu bytes (realsize: %llu)\n", size, (uint64_t) realsize); 637 | 638 | fli = btree_freelist_index_by_exp(exp); 639 | fl = &bt->freelist[fli]; 640 | 641 | /* We need special handling when freeing an allocation that is the same 642 | * size of the freelist block, and the latest free list block for that size 643 | * is full. Without this special handling what happens is that we need 644 | * to allocate a new block of the same size to make space, but doing so 645 | * would result in an element removed from the latest block, so after we 646 | * link the new block we have the previous block that is not full. 647 | * 648 | * Check BTREE.txt in this source distribution for more information. 
*/ 649 | if (fl->last_items == BTREE_FREELIST_BLOCK_ITEMS && 650 | exp == BTREE_FREELIST_SIZE_EXP) 651 | { 652 | /* Just use the freed allocation as the next free block */ 653 | fl->blocks = realloc(fl->blocks,sizeof(uint64_t)*(fl->numblocks+1)); 654 | if (fl->blocks == NULL) return -1; 655 | fl->blocks[fl->numblocks] = ptr; 656 | fl->numblocks++; 657 | fl->last_items = 0; 658 | /* Init block setting items count, next pointer, prev pointer. */ 659 | btree_pwrite_u64(bt,0,ptr+sizeof(uint64_t)); /* next */ 660 | btree_pwrite_u64(bt,fl->blocks[fl->numblocks-2],ptr); /* prev */ 661 | btree_pwrite_u64(bt,0,ptr+sizeof(uint64_t)*2); /* numitems */ 662 | btree_sync(bt); /* Make sure it's ok before linking it to prev block */ 663 | /* Link this new block to the free list blocks updating next pointer 664 | * of the previous block. */ 665 | btree_pwrite_u64(bt,ptr,fl->blocks[fl->numblocks-2]+sizeof(uint64_t)); 666 | btree_sync(bt); 667 | } else { 668 | /* Allocate a new block if needed */ 669 | if (fl->last_items == BTREE_FREELIST_BLOCK_ITEMS) { 670 | uint64_t newblock; 671 | 672 | newblock = btree_alloc(bt,BTREE_FREELIST_BLOCK_SIZE); 673 | if (newblock == 0) return -1; 674 | 675 | fl->blocks = realloc(fl->blocks,sizeof(uint64_t)*(fl->numblocks+1)); 676 | if (fl->blocks == NULL) return -1; 677 | fl->blocks[fl->numblocks] = newblock; 678 | fl->numblocks++; 679 | fl->last_items = 0; 680 | /* Init block setting items count, next pointer, prev pointer. */ 681 | btree_pwrite_u64(bt,0,newblock+sizeof(uint64_t)); /* next */ 682 | btree_pwrite_u64(bt,fl->blocks[fl->numblocks-2],newblock);/* prev */ 683 | btree_pwrite_u64(bt,0,newblock+sizeof(uint64_t)*2); /* numitems */ 684 | btree_sync(bt); /* Make sure it's ok before linking it. */ 685 | /* Link this new block to the free list blocks updating next pointer 686 | * of the previous block. 
*/ 687 | btree_pwrite_u64(bt,newblock,fl->blocks[fl->numblocks-2]+sizeof(uint64_t)); 688 | btree_sync(bt); 689 | } 690 | /* Add the item */ 691 | fl->last_block[fl->last_items] = ptr-sizeof(uint64_t); 692 | fl->last_items++; 693 | /* Write the pointer in the block first */ 694 | printf("Write freelist item about ptr %llu at %llu\n", 695 | ptr, fl->blocks[fl->numblocks-1]+(sizeof(uint64_t)*3) 696 | +(sizeof(uint64_t)*(fl->last_items-1))); 697 | btree_pwrite_u64(bt,ptr-sizeof(uint64_t),fl->blocks[fl->numblocks-1]+(sizeof(uint64_t)*3)+(sizeof(uint64_t)*(fl->last_items-1))); 698 | btree_sync(bt); 699 | /* Then write the items count. */ 700 | printf("Write the new count for block %lld: %lld at %lld\n", 701 | fl->blocks[fl->numblocks-1], 702 | (uint64_t) fl->last_items, 703 | fl->blocks[fl->numblocks-1]+sizeof(uint64_t)*2); 704 | btree_pwrite_u64(bt,fl->last_items,fl->blocks[fl->numblocks-1]+sizeof(uint64_t)*2); 705 | btree_sync(bt); 706 | } 707 | return 0; 708 | } 709 | 710 | /* --------------------------- btree operations ---------------------------- */ 711 | 712 | int btree_node_is_full(struct btree_node *n) { 713 | return n->numkeys == BTREE_MAX_KEYS; 714 | } 715 | 716 | /* Add a key at the specified position 'i' inside an in-memory node. 717 | * All the other keys starting from the old key at position 'i' are 718 | * shifted one position to the right. 719 | * 720 | * Note: this function does not change the position of the children as it 721 | * is intented to be used only on leafs. */ 722 | void btree_node_insert_key_at(struct btree_node *n, int i, unsigned char *key, uint64_t valoff) { 723 | void *p; 724 | 725 | p = n->keys + (i*BTREE_HASHED_KEY_LEN); 726 | memmove(p+BTREE_HASHED_KEY_LEN,p,(n->numkeys-i)*BTREE_HASHED_KEY_LEN); 727 | memmove(n->values+i+1,n->values+i,(n->numkeys-i)*8); 728 | memcpy(p,key,BTREE_HASHED_KEY_LEN); 729 | n->values[i] = valoff; 730 | n->numkeys++; 731 | } 732 | 733 | /* Insert a key (and associated value) into a non full node. 
734 | * If the node is a leaf the key can be inserted in the current node otherwise 735 | * we need to walk the three, possibly splitting full nodes as we descend. 736 | * 737 | * The nodeptr is the offset of the node we want to insert into. 738 | * 739 | * Pointedby is the offset on disk inside the parent of the node pointed by 740 | * 'nodeptr'. As we always write new full nodes instead of modifying old ones 741 | * in order to be more crash proof, we need to update the pointer in the 742 | * parent node when everything is ready. 743 | * 744 | * The function returns 0 on success, and -1 on error. 745 | * On error errno is set accordingly, and may also assume the following values: 746 | * 747 | * EFAULT if the btree seems corrupted. 748 | * EEXIST if the key already exists. 749 | */ 750 | int btree_add_nonfull(struct btree *bt, uint64_t nodeptr, uint64_t pointedby, unsigned char *key, unsigned char *val, size_t vlen, int replace) { 751 | struct btree_node *n = NULL; 752 | int i, found = 0; 753 | 754 | if ((n = btree_read_node(bt,nodeptr)) == NULL) return -1; 755 | i = n->numkeys-1; 756 | 757 | /* Seek to the right position in the current node */ 758 | while(1) { 759 | int cmp; 760 | 761 | if (i < 0) break; 762 | cmp = memcmp(key,n->keys+i*BTREE_HASHED_KEY_LEN,BTREE_HASHED_KEY_LEN); 763 | if (cmp == 0) { 764 | found = 1; /* the key is already present in the btree */ 765 | break; 766 | } 767 | if (cmp >= 0) break; 768 | i--; 769 | } 770 | 771 | /* Key already present? Replace it with the new value if replace is true 772 | * otherwise return an error. */ 773 | if (found) { 774 | if (!replace) { 775 | errno = EBUSY; 776 | return -1; 777 | } else { 778 | uint64_t oldvaloff = n->values[i]; 779 | uint64_t newvaloff; 780 | 781 | if ((newvaloff = btree_alloc(bt,vlen)) == 0) goto err; 782 | if (btree_pwrite(bt,val,vlen,newvaloff) == -1) goto err; 783 | btree_sync(bt); 784 | /* Overwrite the pointer to the old value off with the new one. 
*/ 785 | if (btree_pwrite_u64(bt,newvaloff,nodeptr+16+(BTREE_HASHED_KEY_LEN*BTREE_MAX_KEYS)+(8*i)) == -1) goto err; 786 | /* Finally we can free the old value, and the in memory node. */ 787 | btree_free(bt,oldvaloff); 788 | btree_free_node(n); 789 | return 0; 790 | } 791 | } 792 | 793 | if (n->isleaf) { 794 | uint64_t newoff; /* New node offset */ 795 | uint64_t valoff; /* Value offset on disk */ 796 | 797 | /* Write the value on disk */ 798 | if ((valoff = btree_alloc(bt,vlen)) == 0) goto err; 799 | if (btree_pwrite(bt,val,vlen,valoff) == -1) goto err; 800 | /* Insert the new key in place, and a pointer to the value. */ 801 | btree_node_insert_key_at(n,i+1,key,valoff); 802 | /* Write the modified node to disk */ 803 | if ((newoff = btree_alloc(bt,BTREE_NODE_SIZE)) == 0) goto err; 804 | if (btree_write_node(bt,n,newoff) == -1) goto err; 805 | /* Update the pointer pointing to this node with the new node offset. */ 806 | if (btree_pwrite_u64(bt,newoff,pointedby) == -1) goto err; 807 | if (pointedby == BTREE_HDR_ROOTPTR_POS) bt->rootptr = newoff; 808 | /* Free the old node on disk */ 809 | if (btree_free(bt,nodeptr) == -1) goto err; 810 | btree_free_node(n); 811 | } else { 812 | struct btree_node *child; 813 | uint64_t newnode; 814 | 815 | i++; 816 | if ((child = btree_read_node(bt,n->children[i])) == NULL) return -1; 817 | if (btree_node_is_full(child)) { 818 | if (btree_split_child(bt,pointedby,nodeptr,i,n->children[i], 819 | &newnode) == -1) 820 | { 821 | btree_free_node(child); 822 | goto err; 823 | } 824 | } else { 825 | pointedby = nodeptr+16+BTREE_HASHED_KEY_LEN*BTREE_MAX_KEYS+8*BTREE_MAX_KEYS+8*i; 826 | newnode = n->children[i]; 827 | /* Fixme, here we can set 'n' to 'child' and tail-recurse with 828 | * a goto, to avoid re-reading the same node again. 
*/ 829 | } 830 | btree_free_node(n); 831 | btree_free_node(child); 832 | return btree_add_nonfull(bt,newnode,pointedby,key,val,vlen,replace); 833 | } 834 | return 0; 835 | 836 | err: 837 | btree_free_node(n); 838 | return -1; 839 | } 840 | 841 | /* Split child, that is the i-th child of parent. 842 | * We'll write three new nodes, two to split the original child in two nodes 843 | * and one containing the updated parent. 844 | * Finally we'll set 'pointedby' to the offset of the new parent. So 845 | * pointedby must point to the offset where the parent is referenced on disk, 846 | * that is the root pointer heeader if it's the root node, or the right offset 847 | * inside its parent (that is, the parent of the parent). */ 848 | int btree_split_child(struct btree *bt, uint64_t pointedby, uint64_t parentoff, 849 | int i, uint64_t childoff, uint64_t *newparent) 850 | { 851 | struct btree_node *lnode = NULL, *rnode = NULL; 852 | struct btree_node *child = NULL, *parent = NULL; 853 | int halflen = (BTREE_MAX_KEYS-1)/2; 854 | uint64_t loff, roff, poff; /* new left, right, parent nodes offets. */ 855 | 856 | /* Read parent and child from disk. 857 | * Also creates new nodes in memory, lnode and rnode, that will be 858 | * the nodes produced splitting the child into two nodes. 
*/ 859 | if ((parent = btree_read_node(bt,parentoff)) == NULL) goto err; 860 | if ((child = btree_read_node(bt,childoff)) == NULL) goto err; 861 | if ((lnode = btree_create_node()) == NULL) goto err; 862 | if ((rnode = btree_create_node()) == NULL) goto err; 863 | /* Two fundamental conditions that must be always true */ 864 | assert(child->numkeys == BTREE_MAX_KEYS); 865 | assert(parent->numkeys != BTREE_MAX_KEYS); 866 | /* Split the child into lnode and rnode */ 867 | memcpy(lnode->keys,child->keys,BTREE_HASHED_KEY_LEN*halflen); 868 | memcpy(lnode->values,child->values,8*halflen); 869 | memcpy(lnode->children,child->children,8*(halflen+1)); 870 | lnode->numkeys = halflen; 871 | lnode->isleaf = child->isleaf; 872 | /* And the rnode */ 873 | memcpy(rnode->keys,child->keys+BTREE_HASHED_KEY_LEN*(halflen+1), 874 | BTREE_HASHED_KEY_LEN*halflen); 875 | memcpy(rnode->values,child->values+halflen+1,8*halflen); 876 | memcpy(rnode->children,child->children+halflen+1,8*(halflen+1)); 877 | rnode->numkeys = halflen; 878 | rnode->isleaf = child->isleaf; 879 | /* Save left and right children on disk */ 880 | if ((loff = btree_alloc(bt,BTREE_NODE_SIZE)) == 0) goto err; 881 | if ((roff = btree_alloc(bt,BTREE_NODE_SIZE)) == 0) goto err; 882 | if (btree_write_node(bt,lnode,loff) == -1) goto err; 883 | if (btree_write_node(bt,rnode,roff) == -1) goto err; 884 | 885 | /* Now fix the parent node: 886 | * let's move the child's median key into the parent. 887 | * Shift the current keys, values, and child pointers. 
*/ 888 | memmove(parent->keys+BTREE_HASHED_KEY_LEN*(i+1), 889 | parent->keys+BTREE_HASHED_KEY_LEN*i, 890 | (parent->numkeys-i)*BTREE_HASHED_KEY_LEN); 891 | memmove(parent->values+i+1,parent->values+i,(parent->numkeys-i)*8); 892 | memmove(parent->children+i+2,parent->children+i+1,(parent->numkeys-i)*8); 893 | /* Set the key and left and right children */ 894 | memcpy(parent->keys+BTREE_HASHED_KEY_LEN*i, 895 | child->keys+BTREE_HASHED_KEY_LEN*halflen,BTREE_HASHED_KEY_LEN); 896 | parent->values[i] = child->values[halflen]; 897 | parent->children[i] = loff; 898 | parent->children[i+1] = roff; 899 | parent->numkeys++; 900 | /* Write the parent on disk */ 901 | if ((poff = btree_alloc(bt,BTREE_NODE_SIZE)) == 0) goto err; 902 | if (btree_write_node(bt,parent,poff) == -1) goto err; 903 | if (newparent) *newparent = poff; 904 | /* Now link the new nodes to the old btree */ 905 | btree_sync(bt); /* Make sure the nodes are flushed */ 906 | if (btree_pwrite_u64(bt,poff,pointedby) == -1) goto err; 907 | if (pointedby == BTREE_HDR_ROOTPTR_POS) bt->rootptr = poff; 908 | /* Finally reclaim the space used by the old nodes */ 909 | btree_free(bt,parentoff); 910 | btree_free(bt,childoff); 911 | 912 | btree_free_node(lnode); 913 | btree_free_node(rnode); 914 | btree_free_node(parent); 915 | btree_free_node(child); 916 | return 0; 917 | 918 | err: 919 | btree_free_node(lnode); 920 | btree_free_node(rnode); 921 | btree_free_node(parent); 922 | btree_free_node(child); 923 | return -1; 924 | } 925 | 926 | int btree_add(struct btree *bt, unsigned char *key, unsigned char *val, size_t vlen, int replace) { 927 | struct btree_node *root; 928 | 929 | if ((root = btree_read_node(bt,bt->rootptr)) == NULL) return -1; 930 | 931 | if (btree_node_is_full(root)) { 932 | uint64_t rootptr; 933 | 934 | /* Root is full. Split it. */ 935 | btree_free_node(root); 936 | root = NULL; 937 | /* Create a fresh node on disk: will be our new root. 
*/ 938 | if ((root = btree_create_node()) == NULL) return -1; 939 | if ((rootptr = btree_alloc(bt,BTREE_NODE_SIZE)) == 0) goto err; 940 | if (btree_write_node(bt,root,rootptr) == -1) goto err; 941 | btree_free_node(root); 942 | /* Split it */ 943 | if (btree_split_child(bt,BTREE_HDR_ROOTPTR_POS,rootptr,0,bt->rootptr,NULL) == -1) goto err; 944 | } else { 945 | btree_free_node(root); 946 | } 947 | return btree_add_nonfull(bt,bt->rootptr,BTREE_HDR_ROOTPTR_POS,key,val,vlen,replace); 948 | 949 | err: 950 | btree_free_node(root); 951 | return -1; 952 | } 953 | 954 | /* Find a record by key. 955 | * The function seraches for the specified key. If the key is found 956 | * 0 is returned, and *voff is set to the offset of the value on disk. 957 | * 958 | * On error -1 is returned and errno set accordingly. 959 | * 960 | * Non existing key is considered an error with errno = ENOENT. */ 961 | int btree_find(struct btree *bt, unsigned char *key, uint64_t *voff) { 962 | struct btree_node *n; 963 | uint64_t nptr = bt->rootptr; 964 | unsigned int j; 965 | 966 | while(1) { 967 | int cmp; 968 | 969 | if ((n = btree_read_node(bt,nptr)) == NULL) return -1; 970 | for (j = 0; j < n->numkeys; j++) { 971 | cmp = memcmp(key,n->keys+BTREE_HASHED_KEY_LEN*j, 972 | BTREE_HASHED_KEY_LEN); 973 | if (cmp <= 0) break; 974 | } 975 | if (j < n->numkeys && cmp == 0) { 976 | if (voff) *voff = n->values[j]; 977 | btree_free_node(n); 978 | return 0; 979 | } 980 | if (n->isleaf || n->children[j] == 0) { 981 | btree_free_node(n); 982 | errno = ENOENT; 983 | return -1; 984 | } 985 | nptr = n->children[j]; 986 | btree_free_node(n); 987 | } 988 | } 989 | 990 | /* Just a debugging function to check what's inside the whole btree... 
*/ 991 | void btree_walk_rec(struct btree *bt, uint64_t nodeptr, int level) { 992 | struct btree_node *n; 993 | unsigned int j; 994 | 995 | n = btree_read_node(bt,nodeptr); 996 | if (n == NULL) { 997 | printf("Error walking the btree: %s\n", strerror(errno)); 998 | return; 999 | } 1000 | for (j = 0; j < n->numkeys; j++) { 1001 | char *data; 1002 | uint32_t datalen; 1003 | int k; 1004 | 1005 | if (n->children[j] != 0) { 1006 | btree_walk_rec(bt,n->children[j],level+1); 1007 | } 1008 | for (k = 0; k < level; k++) printf(" "); 1009 | printf("(@%llu) Key %20s: ", nodeptr, n->keys+(j*BTREE_HASHED_KEY_LEN)); 1010 | btree_alloc_size(bt,&datalen,n->values[j]); 1011 | data = malloc(datalen+1); 1012 | btree_pread(bt,data,datalen,n->values[j]); 1013 | data[datalen] = '\0'; 1014 | printf("@%llu %lu bytes: %s\n", 1015 | n->values[j], 1016 | (unsigned long)datalen, data); 1017 | free(data); 1018 | } 1019 | if (n->children[j] != 0) { 1020 | btree_walk_rec(bt,n->children[j], level+1); 1021 | } 1022 | } 1023 | 1024 | void btree_walk(struct btree *bt, uint64_t nodeptr) { 1025 | btree_walk_rec(bt,nodeptr,0); 1026 | } 1027 | -------------------------------------------------------------------------------- /btree.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2011, Salvatore Sanfilippo 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * * Redistributions of source code must retain the above copyright notice, 9 | * this list of conditions and the following disclaimer. 10 | * * Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 
13 | * * Neither the name of Redis nor the names of its contributors may be used 14 | * to endorse or promote products derived from this software without 15 | * specific prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 21 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 | * POSSIBILITY OF SUCH DAMAGE. 28 | */ 29 | 30 | #include 31 | #include 32 | 33 | #define BTREE_CREAT 1 34 | 35 | #define BTREE_PREALLOC_SIZE (1024*512) 36 | #define BTREE_FREELIST_BLOCK_ITEMS 252 37 | #define BTREE_MIN_KEYS 4 38 | //#define BTREE_MAX_KEYS 255 39 | #define BTREE_MAX_KEYS 7 40 | #define BTREE_HASHED_KEY_LEN 16 41 | 42 | /* We have free lists for the following sizes: 43 | * 16 32 64 128 256 512 1024 2048 4096 8192 16k 32k 64k 128k 256k 512k 1M 2M 4M 8M 16M 32M 64M 128M 256M 512M 1G 2G */ 44 | #define BTREE_FREELIST_COUNT 28 45 | 46 | /* A free list block is composed of 2 pointers (prev, next), one count 47 | * (numitems), and a pointer for every free list item inside. 
*/ 48 | #define BTREE_FREELIST_BLOCK_SIZE ((8*3)+(8*BTREE_FREELIST_BLOCK_ITEMS)) 49 | #define BTREE_FREELIST_SIZE_EXP 11 /* 2^11 = 2048 */ 50 | 51 | /* A node is composed of: 52 | * one count (startmark), 53 | * one count (numkeys), 54 | * one count (isleaf), 55 | * BTREE_MAX_KEYS keys (16 bytes for each key, as our keys are fixed size), 56 | * BTREE_MAX_KEYS pointers to values, 57 | * BTREE_MAX_KEYS+1 child pointers, 58 | * and a final count(endmark) */ 59 | #define BTREE_NODE_SIZE (4*4+BTREE_MAX_KEYS*BTREE_HASHED_KEY_LEN+((BTREE_MAX_KEYS*2)+1)*8+4) 60 | 61 | /* Offsets inside the file of the 'free' and 'freeoff' fields */ 62 | #define BTREE_HDR_FREE_POS 16 63 | #define BTREE_HDR_FREEOFF_POS 24 64 | #define BTREE_HDR_ROOTPTR_POS (32+(BTREE_FREELIST_BLOCK_SIZE*BTREE_FREELIST_COUNT)) 65 | 66 | /* ------------------------------ VFS Layer --------------------------------- */ 67 | 68 | struct btree_vfs { 69 | void *(*open) (char *path, int flags); 70 | void (*close) (void *vfs_handle); 71 | ssize_t (*pread) (void *vfs_handle, void *buf, uint32_t nbytes, 72 | uint64_t offset); 73 | ssize_t (*pwrite) (void *vfs_handle, const void *buf, uint32_t nbytes, 74 | uint64_t offset); 75 | int (*resize) (void *vfs_handle, uint64_t length); 76 | int (*getsize) (void *vfs_handle, uint64_t *size); 77 | void (*sync) (void *vfs_handle); 78 | }; 79 | 80 | extern struct btree_vfs bvfs_unistd; 81 | 82 | /* ------------------------------ ALLOCATOR --------------------------------- */ 83 | 84 | struct btree_freelist { 85 | uint32_t numblocks; /* number of freelist blocks */ 86 | uint64_t *blocks; /* blocks offsets. 
last is block[numblocks-1] */ 87 | uint32_t last_items; /* number of items in the last block */ 88 | uint64_t last_block[BTREE_FREELIST_BLOCK_ITEMS]; /* last block cached */ 89 | }; 90 | 91 | /* -------------------------------- BTREE ----------------------------------- */ 92 | 93 | #define BTREE_FLAG_NOFLAG 0 94 | #define BTREE_FLAG_USE_WRITE_BARRIER 1 95 | 96 | /* This is our btree object, returned to the client when the btree is 97 | * opened, and used as first argument for all the btree API. */ 98 | struct btree { 99 | struct btree_vfs *vfs; /* Our VFS API */ 100 | void *vfs_handle; /* The open VFS resource */ 101 | /* Our free lists, from 4 bytes to 4 gigabytes, so freelist[0] is for 102 | * size 4, and freelist[BTREE_FREELIST_COUNT-1] is for 4GB. */ 103 | struct btree_freelist freelist[BTREE_FREELIST_COUNT]; 104 | /* We pre-allocate free space at the end of the file, as a room for 105 | * the allocator. Amount and location of free space is handled 106 | * by the following fields: */ 107 | uint64_t free; /* Amount of free space starting at freeoff */ 108 | uint64_t freeoff; /* Offset where free space starts */ 109 | uint64_t rootptr; /* Root node pointer */ 110 | uint32_t mark; /* This incremental number is used for 111 | nodes start/end mark to detect corruptions. */ 112 | int flags; /* BTREE_FLAG_* */ 113 | }; 114 | 115 | /* In memory representation of a btree node. We manipulate this in memory 116 | * representation in order to avoid to deal with too much disk operations 117 | * and complexities. Once a node was modified it can be written back to disk 118 | * using btree_write_node. 
*/ 119 | struct btree_node { 120 | uint32_t numkeys; 121 | uint32_t isleaf; 122 | char keys[BTREE_HASHED_KEY_LEN*BTREE_MAX_KEYS]; 123 | uint64_t values[BTREE_MAX_KEYS]; 124 | uint64_t children[BTREE_MAX_KEYS+1]; 125 | }; 126 | 127 | /* ---------------------------- EXPORTED API ------------------------------- */ 128 | 129 | /* Btree */ 130 | struct btree *btree_open(struct btree_vfs *vfs, char *path, int flags); 131 | void btree_close(struct btree *bt); 132 | void btree_set_flags(struct btree *bt, int flags); 133 | void btree_clear_flags(struct btree *bt, int flags); 134 | int btree_add(struct btree *bt, unsigned char *key, unsigned char *val, size_t vlen, int replace); 135 | int btree_find(struct btree *bt, unsigned char *key, uint64_t *voff); 136 | void btree_walk(struct btree *bt, uint64_t nodeptr); 137 | 138 | /* On disk allocator */ 139 | uint64_t btree_alloc(struct btree *bt, uint32_t size); 140 | int btree_free(struct btree *bt, uint64_t ptr); 141 | int btree_alloc_size(struct btree *bt, uint32_t *size, uint64_t ptr); 142 | ssize_t btree_pread(struct btree *bt, void *buf, uint32_t nbytes, 143 | uint64_t offset); 144 | -------------------------------------------------------------------------------- /btree_example.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2011, Salvatore Sanfilippo 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * * Redistributions of source code must retain the above copyright notice, 9 | * this list of conditions and the following disclaimer. 10 | * * Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 
13 | * * Neither the name of Redis nor the names of its contributors may be used 14 | * to endorse or promote products derived from this software without 15 | * specific prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 21 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 22 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 23 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 24 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 25 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 26 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 | * POSSIBILITY OF SUCH DAMAGE. 
28 | */ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #include "btree.h" 36 | 37 | #define OP_ALLOC 0 38 | #define OP_FREE 1 39 | #define OP_ALLOCFREE 2 40 | #define OP_ADD 3 41 | #define OP_WALK 4 42 | #define OP_FILL 5 43 | #define OP_FIND 6 44 | 45 | int main(int argc, char **argv) { 46 | struct btree *bt; 47 | uint64_t ptr; 48 | int j, count, op, arg; 49 | 50 | if (argc != 4) { 51 | fprintf(stderr,"Usage: btree_example \n"); 52 | exit(1); 53 | } 54 | count = atoi(argv[3]); 55 | arg = atoi(argv[2]); 56 | if (!strcasecmp(argv[1],"alloc")) { 57 | op = OP_ALLOC; 58 | } else if (!strcasecmp(argv[1],"free")) { 59 | op = OP_FREE; 60 | } else if (!strcasecmp(argv[1],"allocfree")) { 61 | op = OP_ALLOCFREE; 62 | } else if (!strcasecmp(argv[1],"add")) { 63 | op = OP_ADD; 64 | } else if (!strcasecmp(argv[1],"walk")) { 65 | op = OP_WALK; 66 | } else if (!strcasecmp(argv[1],"fill")) { 67 | op = OP_FILL; 68 | } else if (!strcasecmp(argv[1],"find")) { 69 | op = OP_FIND; 70 | } else { 71 | printf("not supported op %s\n", argv[1]); 72 | exit(1); 73 | } 74 | 75 | bt = btree_open(NULL, "./btree.db", BTREE_CREAT); 76 | btree_clear_flags(bt,BTREE_FLAG_USE_WRITE_BARRIER); 77 | if (bt == NULL) { 78 | perror("btree_open"); 79 | exit(1); 80 | } 81 | 82 | for (j = 0; j < count; j++) { 83 | if (op == OP_ALLOC) { 84 | ptr = btree_alloc(bt,arg); 85 | printf("PTR: %llu\n", ptr); 86 | } else if (op == OP_FREE) { 87 | btree_free(bt,arg); 88 | } else if (op == OP_ALLOCFREE) { 89 | ptr = btree_alloc(bt,arg); 90 | printf("PTR: %llu\n", ptr); 91 | btree_free(bt,ptr); 92 | } 93 | } 94 | 95 | if (op == OP_ADD) { 96 | int retval; 97 | char key[16]; 98 | memset(key,0,16); 99 | strcpy(key,argv[2]); 100 | 101 | retval = btree_add(bt,(unsigned char*)key, 102 | (unsigned char*)argv[3],strlen(argv[3]),1); 103 | printf("retval %d\n", retval); 104 | if (retval == -1) { 105 | printf("Error: %s\n", strerror(errno)); 106 | } 107 | } else if (op == OP_WALK) { 108 | 
btree_walk(bt,bt->rootptr); 109 | } else if (op == OP_FILL) { 110 | for (j = 0; j < count; j++) { 111 | int r = random()%arg; 112 | int retval; 113 | char key[64]; 114 | char val[64]; 115 | 116 | memset(key,0,64); 117 | snprintf(key,64,"k%d",r); 118 | snprintf(val,64,"val:%d",r); 119 | retval = btree_add(bt,(unsigned char*)key, 120 | (unsigned char*)val, strlen(val), 1); 121 | if (retval == -1) { 122 | printf("Error: %s\n", strerror(errno)); 123 | goto err; 124 | } 125 | } 126 | } else if (op == OP_FIND) { 127 | int retval; 128 | char key[16], *data; 129 | memset(key,0,16); 130 | strcpy(key,argv[2]); 131 | uint64_t voff; 132 | uint32_t datalen; 133 | 134 | retval = btree_find(bt,(unsigned char*)key,&voff); 135 | if (retval == -1) { 136 | if (errno == ENOENT) { 137 | printf("Key not found\n"); 138 | exit(0); 139 | } else { 140 | perror("Error searching for key"); 141 | exit(1); 142 | } 143 | } 144 | printf("Key found at %llu\n", voff); 145 | 146 | btree_alloc_size(bt,&datalen,voff); 147 | data = malloc(datalen+1); 148 | btree_pread(bt,(unsigned char*)data,datalen,voff); 149 | data[datalen] = '\0'; 150 | printf("Value: %s\n", data); 151 | free(data); 152 | } 153 | btree_close(bt); 154 | return 0; 155 | 156 | err: 157 | btree_close(bt); 158 | return 1; 159 | } 160 | --------------------------------------------------------------------------------