├── .gitignore ├── Makefile.am ├── README ├── btest.c ├── btree.3 ├── btree.c ├── btree.h ├── configure.ac ├── include └── sys │ ├── queue.h │ └── tree.h ├── sha1tiny.c ├── sha1tiny.h ├── test_btree.c └── test_sha1tiny.c /.gitignore: -------------------------------------------------------------------------------- 1 | .*.swp 2 | *~ 3 | *.o 4 | *.a 5 | *.so 6 | *.exe 7 | # autotools 8 | Makefile.in 9 | aclocal.m4 10 | autom4te.cache/ 11 | config.guess 12 | config.sub 13 | configure 14 | depcomp 15 | install-sh 16 | missing 17 | .deps/ 18 | Makefile 19 | config.log 20 | config.status 21 | 22 | # executables and test outputs 23 | btest 24 | test_btree 25 | test_sha1tiny 26 | test.db 27 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | AUTOMAKE_OPTIONS = subdir-objects 2 | 3 | lib_LIBRARIES = libbtree.a 4 | 5 | libbtree_a_SOURCES = btree.c sha1tiny.c 6 | 7 | include_HEADERS = btree.h 8 | 9 | man3_MANS = btree.3 10 | 11 | noinst_PROGRAMS = btest test_btree test_sha1tiny 12 | 13 | btest_SOURCES = btest.c 14 | btest_LDADD = libbtree.a 15 | 16 | test_btree_SOURCES = test_btree.c 17 | test_btree_LDADD = libbtree.a 18 | 19 | test_sha1tiny_SOURCES = test_sha1tiny.c sha1tiny.c 20 | 21 | AM_CFLAGS = -Wall -W -g -std=c89 -D_XOPEN_SOURCE=700 22 | AM_CFLAGS += -D_GNU_SOURCE # asprintf 23 | AM_CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 24 | AM_CFLAGS += -I$(top_srcdir)/include 25 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Building 2 | ======== 3 | autoreconf -i 4 | ./configure 5 | make 6 | -------------------------------------------------------------------------------- /btest.c: -------------------------------------------------------------------------------- 1 | /* $OpenBSD: btest.c,v 1.1 2010/05/31 17:36:31 
martinh Exp $ */ 2 | 3 | /* Simple test program for the btree database. */ 4 | /* 5 | * Copyright (c) 2009 Martin Hedenfalk 6 | * 7 | * Permission to use, copy, modify, and distribute this software for any 8 | * purpose with or without fee is hereby granted, provided that the above 9 | * copyright notice and this permission notice appear in all copies. 10 | * 11 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 12 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 13 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 14 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 15 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 16 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 17 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 18 | */ 19 | 20 | #include 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include "btree.h" 28 | 29 | int 30 | main(int argc, char **argv) 31 | { 32 | int c, rc = BT_FAIL; 33 | unsigned int flags = 0; 34 | struct btree *bt; 35 | struct cursor *cursor; 36 | const char *filename = "test.db"; 37 | struct btval key, data, maxkey; 38 | 39 | while ((c = getopt(argc, argv, "rf:")) != -1) { 40 | switch (c) { 41 | case 'r': 42 | flags |= BT_REVERSEKEY; 43 | break; 44 | case 'f': 45 | filename = optarg; 46 | break; 47 | } 48 | } 49 | 50 | argc -= optind; 51 | argv += optind; 52 | 53 | if (argc == 0) 54 | errx(1, "missing command"); 55 | 56 | bt = btree_open(filename, flags | BT_NOSYNC, 0644); 57 | if (bt == NULL) 58 | err(1, filename); 59 | 60 | bzero(&key, sizeof(key)); 61 | bzero(&data, sizeof(data)); 62 | bzero(&maxkey, sizeof(maxkey)); 63 | 64 | if (strcmp(argv[0], "put") == 0) { 65 | if (argc < 3) 66 | errx(1, "missing arguments"); 67 | key.data = argv[1]; 68 | key.size = strlen(key.data); 69 | data.data = argv[2]; 70 | data.size = strlen(data.data); 71 | rc = 
btree_put(bt, &key, &data, 0); 72 | if (rc == BT_SUCCESS) 73 | printf("OK\n"); 74 | else 75 | printf("FAIL\n"); 76 | } else if (strcmp(argv[0], "del") == 0) { 77 | if (argc < 1) 78 | errx(1, "missing argument"); 79 | key.data = argv[1]; 80 | key.size = strlen(key.data); 81 | rc = btree_del(bt, &key, NULL); 82 | if (rc == BT_SUCCESS) 83 | printf("OK\n"); 84 | else 85 | printf("FAIL\n"); 86 | } else if (strcmp(argv[0], "get") == 0) { 87 | if (argc < 2) 88 | errx(1, "missing arguments"); 89 | key.data = argv[1]; 90 | key.size = strlen(key.data); 91 | rc = btree_get(bt, &key, &data); 92 | if (rc == BT_SUCCESS) { 93 | printf("OK %.*s\n", (int)data.size, (char *)data.data); 94 | } else { 95 | printf("FAIL\n"); 96 | } 97 | } else if (strcmp(argv[0], "scan") == 0) { 98 | if (argc > 1) { 99 | key.data = argv[1]; 100 | key.size = strlen(key.data); 101 | flags = BT_CURSOR; 102 | } 103 | else 104 | flags = BT_FIRST; 105 | if (argc > 2) { 106 | maxkey.data = argv[2]; 107 | maxkey.size = strlen(key.data); 108 | } 109 | 110 | cursor = btree_cursor_open(bt); 111 | while ((rc = btree_cursor_get(cursor, &key, &data, 112 | flags)) == BT_SUCCESS) { 113 | if (argc > 2 && btree_cmp(bt, &key, &maxkey) > 0) 114 | break; 115 | printf("OK %zi %.*s\n", 116 | key.size, (int)key.size, (char *)key.data); 117 | flags = BT_NEXT; 118 | } 119 | btree_cursor_close(cursor); 120 | } else if (strcmp(argv[0], "compact") == 0) { 121 | if ((rc = btree_compact(bt)) != BT_SUCCESS) 122 | warn("compact"); 123 | } else if (strcmp(argv[0], "revert") == 0) { 124 | if ((rc = btree_revert(bt)) != BT_SUCCESS) 125 | warn("revert"); 126 | } else 127 | errx(1, "%s: invalid command", argv[0]); 128 | 129 | btree_close(bt); 130 | 131 | return rc; 132 | } 133 | 134 | -------------------------------------------------------------------------------- /btree.3: -------------------------------------------------------------------------------- 1 | .\" $OpenBSD: btree.3,v 1.3 2010/11/03 17:30:01 martinh Exp $ 2 | .\" 3 | .\" 
Copyright (c) 2009, 2010 Martin Hedenfalk 4 | .\" 5 | .\" Permission to use, copy, modify, and distribute this software for any 6 | .\" purpose with or without fee is hereby granted, provided that the above 7 | .\" copyright notice and this permission notice appear in all copies. 8 | .\" 9 | .\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 | .\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 | .\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 | .\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 | .\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 | .\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 | .\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 | .\" 17 | .Dd $Mdocdate: November 3 2010 $ 18 | .Dt BTREE 3 19 | .Os 20 | .Sh NAME 21 | .Nm btree_open , 22 | .Nm btree_open_fd , 23 | .Nm btree_close , 24 | .Nm btree_txn_begin , 25 | .Nm btree_txn_get , 26 | .Nm btree_txn_put , 27 | .Nm btree_txn_del , 28 | .Nm btree_txn_commit , 29 | .Nm btree_txn_abort , 30 | .Nm btree_get , 31 | .Nm btree_put , 32 | .Nm btree_del , 33 | .Nm btree_txn_cursor_open , 34 | .Nm btree_cursor_open , 35 | .Nm btree_cursor_close , 36 | .Nm btree_cursor_get , 37 | .Nm btree_stat , 38 | .Nm btree_compact , 39 | .Nm btree_revert , 40 | .Nm btree_sync , 41 | .Nm btree_set_cache_size , 42 | .Nm btree_get_flags , 43 | .Nm btree_get_path , 44 | .Nm btree_cmp , 45 | .Nm btval_reset 46 | .Nd Append-only prefix B+Tree database library. 
47 | .Sh SYNOPSIS 48 | .Fd #include <btree.h> 49 | .Ft "struct btree *" 50 | .Fn "btree_open_fd" "int fd" "unsigned int flags" 51 | .Ft "struct btree *" 52 | .Fn "btree_open" "const char *path" "unsigned int flags" "mode_t mode" 53 | .Ft "void" 54 | .Fn "btree_close" "struct btree *bt" 55 | .Ft "struct btree_txn *" 56 | .Fn "btree_txn_begin" "struct btree *bt" "int rdonly" 57 | .Ft "int" 58 | .Fn "btree_txn_get" "struct btree *bt" "struct btree_txn *" "struct btval *key" "struct btval *data" 59 | .Ft "int" 60 | .Fn "btree_txn_put" "struct btree *bt" "struct btree_txn *" "struct btval *key" "struct btval *data" "unsigned int flags" 61 | .Ft "int" 62 | .Fn "btree_txn_del" "struct btree *bt" "struct btree_txn *" "struct btval *key" "struct btval *data" 63 | .Ft "int" 64 | .Fn "btree_txn_commit" "struct btree_txn *txn" 65 | .Ft "void" 66 | .Fn "btree_txn_abort" "struct btree_txn *txn" 67 | .Ft "int" 68 | .Fn "btree_get" "struct btree *bt" "struct btval *key" "struct btval *data" 69 | .Ft "int" 70 | .Fn "btree_put" "struct btree *bt" "struct btval *key" "struct btval *data" "unsigned flags" 71 | .Ft "int" 72 | .Fn "btree_del" "struct btree *bt" "struct btval *key" "struct btval *data" 73 | .Ft "struct cursor *" 74 | .Fn "btree_txn_cursor_open" "struct btree *bt" "struct btree_txn *txn" 75 | .Ft "struct cursor *" 76 | .Fn "btree_cursor_open" "struct btree *bt" 77 | .Ft "void" 78 | .Fn "btree_cursor_close" "struct cursor *cursor" 79 | .Ft "int" 80 | .Fn "btree_cursor_get" "struct cursor *cursor" "struct btval *key" "struct btval *data" "enum cursor_op op" 81 | .Ft "struct btree_stat *" 82 | .Fn "btree_stat" "struct btree *bt" 83 | .Ft "int" 84 | .Fn "btree_compact" "struct btree *bt" 85 | .Ft "int" 86 | .Fn "btree_revert" "struct btree *bt" 87 | .Ft "int" 88 | .Fn "btree_sync" "struct btree *bt" 89 | .Ft "void" 90 | .Fn "btree_set_cache_size" "struct btree *bt" "unsigned int cache_size" 91 | .Ft "unsigned int" 92 | .Fn "btree_get_flags" "struct btree *bt" 93 | .Ft "const char *" 94
| .Fn "btree_get_path" "struct btree *bt" 95 | .Ft "int" 96 | .Fn "btree_cmp" "struct btree *bt" "const struct btval *a" "const struct btval *b" 97 | .Ft "void" 98 | .Fn "btval_reset" "struct btval *btv" 99 | .Sh DESCRIPTION 100 | The database is implemented as a modified prefix B+tree. 101 | Instead of modifying the database file in place, 102 | each update appends any modified pages at the end of the file. 103 | The last block of the file contains metadata and a pointer to the root page. 104 | The first block of the file contains a header that specifies the page size. 105 | .Pp 106 | Append-only writing gives the following properties: 107 | .Bl -enum 108 | .It 109 | No locks. 110 | Since all writes are appended to the end of the file, multiple 111 | readers can continue reading from the tree as it was when they 112 | started. 113 | This snapshot view might contain outdated versions of entries. 114 | .It 115 | Resistance to corruption. 116 | The file content is never modified. 117 | When opening a database file, the last good meta-data page is searched 118 | by scanning backwards. 119 | If there is trailing garbage in the file, it will be skipped. 120 | .It 121 | Hot backups. 122 | Backups can be made on a running server simply by copying the files. 123 | There is no risk of inconsistencies. 124 | .El 125 | .Pp 126 | The drawback is that it wastes space. 127 | A 4-level B+tree database will write at least 5 new pages on each update, 128 | including the meta-data page. 129 | With 4 KiB pagesize, the file would grow by 20 KiB on each update. 130 | .Pp 131 | To reclaim the wasted space, the database should be compacted. 132 | The compaction process opens a write transaction and traverses the tree. 133 | Each active page is then written to a new file. 134 | When complete, a special 135 | .Dq tombstone 136 | page is written to the old file to 137 | signal that it is stale and all processes using the file should re-open it.
138 | Modifications are denied on a stale file and fail with errno set to ESTALE. 139 | .Sh CURSORS 140 | A new cursor may be opened with a call to 141 | .Fn btree_txn_cursor_open 142 | or 143 | .Fn btree_cursor_open . 144 | The latter is implemented as a macro to the former with a NULL 145 | .Ar txn 146 | argument. 147 | Multiple cursors may be open simultaneously. 148 | The cursor must be closed with 149 | .Fn btree_cursor_close 150 | after use. 151 | .Pp 152 | The cursor can be placed at a specific key by setting 153 | .Ar op 154 | to BT_CURSOR and filling in the 155 | .Ar key 156 | argument. 157 | The cursor will be placed at the smallest key greater or equal to 158 | the specified key. 159 | If 160 | .Ar op 161 | is instead set to BT_CURSOR_EXACT, the cursor will be placed at the 162 | specified key, or fail if it doesn't exist. 163 | .Pp 164 | The database may be traversed from the first key to the last by calling 165 | .Fn btree_cursor_get 166 | with 167 | .Ar op 168 | initially set to BT_FIRST and then set to BT_NEXT. 169 | If the cursor is not yet initialized, ie 170 | .Fn btree_cursor_get 171 | has not yet been called with 172 | .Ar op 173 | set to BT_FIRST or BT_CURSOR, then BT_NEXT behaves as BT_FIRST. 174 | .Sh TRANSACTIONS 175 | There are two types of transactions: write and read-only transactions. 176 | Only one write transaction is allowed at a time. 177 | A read-only transaction allows the grouping of several read operations 178 | to see a consistent state of the database. 179 | .Pp 180 | A transaction is started with 181 | .Fn btree_txn_begin . 182 | If the 183 | .Ar rdonly 184 | parameter is 0, a write transaction is started and an exclusive lock 185 | is taken on the file using 186 | .Xr flock 2 . 187 | No lock is taken for read-only transactions. 188 | .Pp 189 | The transaction is ended either with 190 | .Fn btree_txn_commit 191 | or 192 | .Fn btree_txn_abort . 193 | The 194 | .Ft btree_txn 195 | pointer must not be accessed afterwards. 
196 | Any cursor opened inside the transaction must be closed before the 197 | transaction is ended. 198 | .Sh RETURN VALUES 199 | The 200 | .Fn btree_txn_get , 201 | .Fn btree_txn_put , 202 | .Fn btree_txn_del , 203 | .Fn btree_txn_commit , 204 | .Fn btree_get , 205 | .Fn btree_put , 206 | .Fn btree_del , 207 | .Fn btree_cursor_get , 208 | .Fn btree_compact 209 | and 210 | .Fn btree_revert 211 | functions all return 0 on success. 212 | On failure -1 is returned and errno is set. 213 | .Pp 214 | All functions returning pointers return NULL on error. 215 | .Pp 216 | .Fn btree_txn_put 217 | and 218 | .Fn btree_put 219 | set errno to EEXIST if the key already exists and BT_NOOVERWRITE was 220 | passed in the 221 | .Ar flags 222 | argument. 223 | .Pp 224 | .Fn btree_txn_get , 225 | .Fn btree_txn_del , 226 | .Fn btree_get , 227 | .Fn btree_del 228 | and 229 | .Fn btree_cursor_get 230 | set errno to ENOENT if the specified key was not found. 231 | .Pp 232 | The 233 | .Fn btree_txn_begin , 234 | .Fn btree_cursor_open , 235 | .Fn btree_cursor_get , 236 | .Fn btree_get , 237 | .Fn btree_put , 238 | .Fn btree_del 239 | functions can fail and set errno to ESTALE if the database file has been 240 | compacted by another process. 241 | The file should be re-opened and the operation retried. 242 | .Sh AUTHORS 243 | The 244 | .Nm btree 245 | library was written by 246 | .An Martin Hedenfalk Aq martin@bzero.se 247 | .Sh BUGS 248 | Byte order is assumed never to change.
249 | -------------------------------------------------------------------------------- /btree.c: -------------------------------------------------------------------------------- 1 | /* $OpenBSD: btree.c,v 1.30 2010/09/01 12:13:21 martinh Exp $ */ 2 | 3 | /* 4 | * Copyright (c) 2009, 2010 Martin Hedenfalk 5 | * 6 | * Permission to use, copy, modify, and distribute this software for any 7 | * purpose with or without fee is hereby granted, provided that the above 8 | * copyright notice and this permission notice appear in all copies. 9 | * 10 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 | */ 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #ifdef HAVE_SYS_FILE_H 26 | #include 27 | #endif 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | #include "btree.h" 42 | 43 | /* from compat.h */ 44 | #ifndef __packed 45 | #define __packed __attribute__ ((__packed__)) 46 | #endif 47 | 48 | /* #define DEBUG */ 49 | 50 | #ifdef DEBUG 51 | # define DPRINTF(...) do { fprintf(stderr, "%s:%d: ", __func__, __LINE__); \ 52 | fprintf(stderr, __VA_ARGS__); \ 53 | fprintf(stderr, "\n"); } while(0) 54 | #else 55 | # define DPRINTF(...) 
do { } while(0) 56 | #endif 57 | 58 | #define PAGESIZE 4096 59 | #define BT_MINKEYS 4 60 | #define BT_MAGIC 0xB3DBB3DB 61 | #define BT_VERSION 4 62 | #define MAXKEYSIZE 255 63 | 64 | #define P_INVALID 0xFFFFFFFF 65 | 66 | #define F_ISSET(w, f) (((w) & (f)) == (f)) 67 | 68 | typedef uint32_t pgno_t; 69 | typedef uint16_t indx_t; 70 | 71 | /* There are four page types: meta, index, leaf and overflow. 72 | * They all share the same page header. 73 | */ 74 | struct page { /* represents an on-disk page */ 75 | pgno_t pgno; /* page number */ 76 | #define P_BRANCH 0x01 /* branch page */ 77 | #define P_LEAF 0x02 /* leaf page */ 78 | #define P_OVERFLOW 0x04 /* overflow page */ 79 | #define P_META 0x08 /* meta page */ 80 | #define P_HEAD 0x10 /* header page */ 81 | uint32_t flags; 82 | #define lower b.fb.fb_lower 83 | #define upper b.fb.fb_upper 84 | #define p_next_pgno b.pb_next_pgno 85 | union page_bounds { 86 | struct { 87 | indx_t fb_lower; /* lower bound of free space */ 88 | indx_t fb_upper; /* upper bound of free space */ 89 | } fb; 90 | pgno_t pb_next_pgno; /* overflow page linked list */ 91 | } b; 92 | indx_t ptrs[1]; /* dynamic size */ 93 | } __packed; 94 | 95 | #define PAGEHDRSZ offsetof(struct page, ptrs) 96 | 97 | #define NUMKEYSP(p) (((p)->lower - PAGEHDRSZ) >> 1) 98 | #define NUMKEYS(mp) (((mp)->page->lower - PAGEHDRSZ) >> 1) 99 | #define SIZELEFT(mp) (indx_t)((mp)->page->upper - (mp)->page->lower) 100 | #define PAGEFILL(bt, mp) (1000 * ((bt)->head.psize - PAGEHDRSZ - SIZELEFT(mp)) / \ 101 | ((bt)->head.psize - PAGEHDRSZ)) 102 | #define IS_LEAF(mp) F_ISSET((mp)->page->flags, P_LEAF) 103 | #define IS_BRANCH(mp) F_ISSET((mp)->page->flags, P_BRANCH) 104 | #define IS_OVERFLOW(mp) F_ISSET((mp)->page->flags, P_OVERFLOW) 105 | 106 | struct bt_head { /* header page content */ 107 | uint32_t magic; 108 | uint32_t version; 109 | uint32_t flags; 110 | uint32_t psize; /* page size */ 111 | } __packed; 112 | 113 | struct bt_meta { /* meta (footer) page content */ 114 | 
#define BT_TOMBSTONE 0x01 /* file is replaced */ 115 | uint32_t flags; 116 | pgno_t root; /* page number of root page */ 117 | pgno_t prev_meta; /* previous meta page number */ 118 | time_t created_at; 119 | uint32_t branch_pages; 120 | uint32_t leaf_pages; 121 | uint32_t overflow_pages; 122 | uint32_t revisions; 123 | uint32_t depth; 124 | uint64_t entries; 125 | unsigned char hash[SHA1_DIGEST_LENGTH]; 126 | } __packed; 127 | 128 | struct btkey { 129 | size_t len; 130 | char str[MAXKEYSIZE]; 131 | }; 132 | 133 | struct mpage { /* an in-memory cached page */ 134 | RB_ENTRY(mpage) entry; /* page cache entry */ 135 | SIMPLEQ_ENTRY(mpage) next; /* queue of dirty pages */ 136 | TAILQ_ENTRY(mpage) lru_next; /* LRU queue */ 137 | struct mpage *parent; /* NULL if root */ 138 | unsigned int parent_index; /* keep track of node index */ 139 | struct btkey prefix; 140 | struct page *page; 141 | pgno_t pgno; /* copy of page->pgno */ 142 | short ref; /* increased by cursors */ 143 | short dirty; /* 1 if on dirty queue */ 144 | }; 145 | RB_HEAD(page_cache, mpage); 146 | SIMPLEQ_HEAD(dirty_queue, mpage); 147 | TAILQ_HEAD(lru_queue, mpage); 148 | 149 | static int mpage_cmp(struct mpage *a, struct mpage *b); 150 | static struct mpage *mpage_lookup(struct btree *bt, pgno_t pgno); 151 | static void mpage_add(struct btree *bt, struct mpage *mp); 152 | static void mpage_free(struct mpage *mp); 153 | static void mpage_del(struct btree *bt, struct mpage *mp); 154 | static void mpage_flush(struct btree *bt); 155 | static struct mpage *mpage_copy(struct btree *bt, struct mpage *mp); 156 | static void mpage_prune(struct btree *bt); 157 | static void mpage_dirty(struct btree *bt, struct mpage *mp); 158 | static struct mpage *mpage_touch(struct btree *bt, struct mpage *mp); 159 | 160 | RB_PROTOTYPE(page_cache, mpage, entry, mpage_cmp); 161 | RB_GENERATE(page_cache, mpage, entry, mpage_cmp); 162 | 163 | struct ppage { /* ordered list of pages */ 164 | SLIST_ENTRY(ppage) entry; 165 | struct 
mpage *mpage; 166 | unsigned int ki; /* cursor index on page */ 167 | }; 168 | SLIST_HEAD(page_stack, ppage); 169 | 170 | #define CURSOR_EMPTY(c) SLIST_EMPTY(&(c)->stack) 171 | #define CURSOR_TOP(c) SLIST_FIRST(&(c)->stack) 172 | #define CURSOR_POP(c) SLIST_REMOVE_HEAD(&(c)->stack, entry) 173 | #define CURSOR_PUSH(c,p) SLIST_INSERT_HEAD(&(c)->stack, p, entry) 174 | 175 | struct cursor { 176 | struct btree *bt; 177 | struct btree_txn *txn; 178 | struct page_stack stack; /* stack of parent pages */ 179 | short initialized; /* 1 if initialized */ 180 | short eof; /* 1 if end is reached */ 181 | }; 182 | 183 | #define METAHASHLEN offsetof(struct bt_meta, hash) 184 | #define METADATA(p) ((void *)((char *)p + PAGEHDRSZ)) 185 | 186 | struct node { 187 | #define n_pgno p.np_pgno 188 | #define n_dsize p.np_dsize 189 | union { 190 | pgno_t np_pgno; /* child page number */ 191 | uint32_t np_dsize; /* leaf data size */ 192 | } p; 193 | uint16_t ksize; /* key size */ 194 | #define F_BIGDATA 0x01 /* data put on overflow page */ 195 | uint8_t flags; 196 | char data[1]; 197 | } __packed; 198 | 199 | struct btree_txn { 200 | pgno_t root; /* current / new root page */ 201 | pgno_t next_pgno; /* next unallocated page */ 202 | struct btree *bt; /* btree is ref'd */ 203 | struct dirty_queue *dirty_queue; /* modified pages */ 204 | #define BT_TXN_RDONLY 0x01 /* read-only transaction */ 205 | #define BT_TXN_ERROR 0x02 /* an error has occurred */ 206 | unsigned int flags; 207 | }; 208 | 209 | struct btree { 210 | int fd; 211 | char *path; 212 | #define BT_FIXPADDING 0x01 /* internal */ 213 | unsigned int flags; 214 | bt_cmp_func cmp; /* user compare function */ 215 | struct bt_head head; 216 | struct bt_meta meta; 217 | struct page_cache *page_cache; 218 | struct lru_queue *lru_queue; 219 | struct btree_txn *txn; /* current write transaction */ 220 | int ref; /* increased by cursors & txn */ 221 | struct btree_stat stat; 222 | off_t size; /* current file size */ 223 | }; 224 | 225 | 
#define NODESIZE offsetof(struct node, data) 226 | 227 | #define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->size)) 228 | #define LEAFSIZE(k, d) (NODESIZE + (k)->size + (d)->size) 229 | #define NODEPTRP(p, i) ((struct node *)((char *)(p) + (p)->ptrs[i])) 230 | #define NODEPTR(mp, i) NODEPTRP((mp)->page, i) 231 | #define NODEKEY(node) (void *)((node)->data) 232 | #define NODEDATA(node) (void *)((char *)(node)->data + (node)->ksize) 233 | #define NODEPGNO(node) ((node)->p.np_pgno) 234 | #define NODEDSZ(node) ((node)->p.np_dsize) 235 | 236 | #define BT_COMMIT_PAGES 64 /* max number of pages to write in one commit */ 237 | #define BT_MAXCACHE_DEF 1024 /* max number of pages to keep in cache */ 238 | 239 | static int btree_read_page(struct btree *bt, pgno_t pgno, 240 | struct page *page); 241 | static struct mpage *btree_get_mpage(struct btree *bt, pgno_t pgno); 242 | static int btree_search_page_root(struct btree *bt, 243 | struct mpage *root, struct btval *key, 244 | struct cursor *cursor, int modify, 245 | struct mpage **mpp); 246 | static int btree_search_page(struct btree *bt, 247 | struct btree_txn *txn, struct btval *key, 248 | struct cursor *cursor, int modify, 249 | struct mpage **mpp); 250 | 251 | static int btree_write_header(struct btree *bt, int fd); 252 | static int btree_read_header(struct btree *bt); 253 | static int btree_is_meta_page(struct page *p); 254 | static int btree_read_meta(struct btree *bt, pgno_t *p_next); 255 | static int btree_write_meta(struct btree *bt, pgno_t root, 256 | unsigned int flags); 257 | static void btree_ref(struct btree *bt); 258 | 259 | static struct node *btree_search_node(struct btree *bt, struct mpage *mp, 260 | struct btval *key, int *exactp, unsigned int *kip); 261 | static int btree_add_node(struct btree *bt, struct mpage *mp, 262 | indx_t indx, struct btval *key, struct btval *data, 263 | pgno_t pgno, uint8_t flags); 264 | static void btree_del_node(struct mpage *mp, 265 | indx_t indx); 266 | static int 
btree_read_data(struct btree *bt, struct mpage *mp, 267 | struct node *leaf, struct btval *data); 268 | 269 | static int btree_rebalance(struct btree *bt, struct mpage *mp); 270 | static int btree_update_key(struct mpage *mp, 271 | indx_t indx, struct btval *key); 272 | static int btree_adjust_prefix(struct btree *bt, 273 | struct mpage *src, int delta); 274 | static int btree_move_node(struct btree *bt, struct mpage *src, 275 | indx_t srcindx, struct mpage *dst, indx_t dstindx); 276 | static int btree_merge(struct btree *bt, struct mpage *src, 277 | struct mpage *dst); 278 | static int btree_split(struct btree *bt, struct mpage **mpp, 279 | unsigned int *newindxp, struct btval *newkey, 280 | struct btval *newdata, pgno_t newpgno); 281 | static struct mpage *btree_new_page(struct btree *bt, uint32_t flags); 282 | static int btree_write_overflow_data(struct btree *bt, 283 | struct page *p, struct btval *data); 284 | 285 | static void cursor_pop_page(struct cursor *cursor); 286 | static struct ppage *cursor_push_page(struct cursor *cursor, 287 | struct mpage *mp); 288 | 289 | static int bt_set_key(struct btree *bt, struct mpage *mp, 290 | struct node *node, struct btval *key); 291 | static int btree_sibling(struct cursor *cursor, int move_right); 292 | static int btree_cursor_next(struct cursor *cursor, 293 | struct btval *key, struct btval *data); 294 | static int btree_cursor_set(struct cursor *cursor, 295 | struct btval *key, struct btval *data, int *exactp); 296 | static int btree_cursor_first(struct cursor *cursor, 297 | struct btval *key, struct btval *data); 298 | 299 | static void bt_reduce_separator(struct btree *bt, struct node *min, 300 | struct btval *sep); 301 | static void remove_prefix(struct btree *bt, struct btval *key, 302 | size_t pfxlen); 303 | static void expand_prefix(struct btree *bt, struct mpage *mp, 304 | indx_t indx, struct btkey *expkey); 305 | static void concat_prefix(struct btree *bt, char *s1, size_t n1, 306 | char *s2, size_t n2, char 
*cs, size_t *cn); 307 | static void common_prefix(struct btree *bt, struct btkey *min, 308 | struct btkey *max, struct btkey *pfx); 309 | static void find_common_prefix(struct btree *bt, struct mpage *mp); 310 | 311 | static size_t bt_leaf_size(struct btree *bt, struct btval *key, 312 | struct btval *data); 313 | static size_t bt_branch_size(struct btree *bt, struct btval *key); 314 | 315 | static pgno_t btree_compact_tree(struct btree *bt, pgno_t pgno, 316 | struct btree *btc); 317 | 318 | static int memncmp(const void *s1, size_t n1, 319 | const void *s2, size_t n2); 320 | static int memnrcmp(const void *s1, size_t n1, 321 | const void *s2, size_t n2); 322 | 323 | static int 324 | memncmp(const void *s1, size_t n1, const void *s2, size_t n2) 325 | { 326 | if (n1 < n2) { 327 | if (memcmp(s1, s2, n1) == 0) 328 | return -1; 329 | } 330 | else if (n1 > n2) { 331 | if (memcmp(s1, s2, n2) == 0) 332 | return 1; 333 | } 334 | return memcmp(s1, s2, n1); 335 | } 336 | 337 | static int 338 | memnrcmp(const void *s1, size_t n1, const void *s2, size_t n2) 339 | { 340 | const unsigned char *p1; 341 | const unsigned char *p2; 342 | 343 | if (n1 == 0) 344 | return n2 == 0 ? 0 : -1; 345 | 346 | if (n2 == 0) 347 | return n1 == 0 ? 0 : 1; 348 | 349 | p1 = (const unsigned char *)s1 + n1 - 1; 350 | p2 = (const unsigned char *)s2 + n2 - 1; 351 | 352 | while (*p1 == *p2) { 353 | if (p1 == s1) 354 | return (p2 == s2) ? 0 : -1; 355 | if (p2 == s2) 356 | return (p1 == p2) ? 
0 : 1; 357 | p1--; 358 | p2--; 359 | } 360 | return *p1 - *p2; 361 | } 362 | 363 | int 364 | btree_cmp(struct btree *bt, const struct btval *a, const struct btval *b) 365 | { 366 | return bt->cmp(a, b); 367 | } 368 | 369 | static void 370 | common_prefix(struct btree *bt, struct btkey *min, struct btkey *max, 371 | struct btkey *pfx) 372 | { 373 | size_t n = 0; 374 | char *p1; 375 | char *p2; 376 | 377 | if (min->len == 0 || max->len == 0) { 378 | pfx->len = 0; 379 | return; 380 | } 381 | 382 | if (F_ISSET(bt->flags, BT_REVERSEKEY)) { 383 | p1 = min->str + min->len - 1; 384 | p2 = max->str + max->len - 1; 385 | 386 | while (*p1 == *p2) { 387 | if (p1 < min->str || p2 < max->str) 388 | break; 389 | p1--; 390 | p2--; 391 | n++; 392 | } 393 | 394 | assert(n <= (int)sizeof(pfx->str)); 395 | pfx->len = n; 396 | memcpy(pfx->str, p2 + 1, n); 397 | } else { 398 | p1 = min->str; 399 | p2 = max->str; 400 | 401 | while (*p1 == *p2) { 402 | if (n == min->len || n == max->len) 403 | break; 404 | p1++; 405 | p2++; 406 | n++; 407 | } 408 | 409 | assert(n <= (int)sizeof(pfx->str)); 410 | pfx->len = n; 411 | memcpy(pfx->str, max->str, n); 412 | } 413 | } 414 | 415 | static void 416 | remove_prefix(struct btree *bt, struct btval *key, size_t pfxlen) 417 | { 418 | if (pfxlen == 0 || bt->cmp != NULL) 419 | return; 420 | 421 | DPRINTF("removing %zu bytes of prefix from key [%.*s]", pfxlen, 422 | (int)key->size, (char *)key->data); 423 | assert(pfxlen <= key->size); 424 | key->size -= pfxlen; 425 | if (!F_ISSET(bt->flags, BT_REVERSEKEY)) 426 | key->data = (char *)key->data + pfxlen; 427 | } 428 | 429 | static void 430 | expand_prefix(struct btree *bt, struct mpage *mp, indx_t indx, 431 | struct btkey *expkey) 432 | { 433 | struct node *node; 434 | 435 | node = NODEPTR(mp, indx); 436 | expkey->len = sizeof(expkey->str); 437 | concat_prefix(bt, mp->prefix.str, mp->prefix.len, 438 | NODEKEY(node), node->ksize, expkey->str, &expkey->len); 439 | } 440 | 441 | static int 442 | bt_cmp(struct 
btree *bt, const struct btval *key1, const struct btval *key2, 443 | struct btkey *pfx) 444 | { 445 | if (F_ISSET(bt->flags, BT_REVERSEKEY)) 446 | return memnrcmp(key1->data, key1->size - pfx->len, 447 | key2->data, key2->size); 448 | else 449 | return memncmp((char *)key1->data + pfx->len, key1->size - pfx->len, 450 | key2->data, key2->size); 451 | } 452 | 453 | void 454 | btval_reset(struct btval *btv) 455 | { 456 | if (btv) { 457 | if (btv->mp) 458 | btv->mp->ref--; 459 | if (btv->free_data) 460 | free(btv->data); 461 | memset(btv, 0, sizeof(*btv)); 462 | } 463 | } 464 | 465 | static int 466 | mpage_cmp(struct mpage *a, struct mpage *b) 467 | { 468 | if (a->pgno > b->pgno) 469 | return 1; 470 | if (a->pgno < b->pgno) 471 | return -1; 472 | return 0; 473 | } 474 | 475 | static struct mpage * 476 | mpage_lookup(struct btree *bt, pgno_t pgno) 477 | { 478 | struct mpage find, *mp; 479 | 480 | find.pgno = pgno; 481 | mp = RB_FIND(page_cache, bt->page_cache, &find); 482 | if (mp) { 483 | bt->stat.hits++; 484 | /* Update LRU queue. Move page to the end. 
*/ 485 | TAILQ_REMOVE(bt->lru_queue, mp, lru_next); 486 | TAILQ_INSERT_TAIL(bt->lru_queue, mp, lru_next); 487 | } 488 | return mp; 489 | } 490 | 491 | static void 492 | mpage_add(struct btree *bt, struct mpage *mp) 493 | { 494 | assert(RB_INSERT(page_cache, bt->page_cache, mp) == NULL); 495 | bt->stat.cache_size++; 496 | TAILQ_INSERT_TAIL(bt->lru_queue, mp, lru_next); 497 | } 498 | 499 | static void 500 | mpage_free(struct mpage *mp) 501 | { 502 | if (mp != NULL) { 503 | free(mp->page); 504 | free(mp); 505 | } 506 | } 507 | 508 | static void 509 | mpage_del(struct btree *bt, struct mpage *mp) 510 | { 511 | assert(RB_REMOVE(page_cache, bt->page_cache, mp) == mp); 512 | assert(bt->stat.cache_size > 0); 513 | bt->stat.cache_size--; 514 | TAILQ_REMOVE(bt->lru_queue, mp, lru_next); 515 | } 516 | 517 | static void 518 | mpage_flush(struct btree *bt) 519 | { 520 | struct mpage *mp; 521 | 522 | while ((mp = RB_MIN(page_cache, bt->page_cache)) != NULL) { 523 | mpage_del(bt, mp); 524 | mpage_free(mp); 525 | } 526 | } 527 | 528 | static struct mpage * 529 | mpage_copy(struct btree *bt, struct mpage *mp) 530 | { 531 | struct mpage *copy; 532 | 533 | if ((copy = calloc(1, sizeof(*copy))) == NULL) 534 | return NULL; 535 | if ((copy->page = malloc(bt->head.psize)) == NULL) { 536 | free(copy); 537 | return NULL; 538 | } 539 | memcpy(copy->page, mp->page, bt->head.psize); 540 | memcpy(©->prefix, &mp->prefix, sizeof(mp->prefix)); 541 | copy->parent = mp->parent; 542 | copy->parent_index = mp->parent_index; 543 | copy->pgno = mp->pgno; 544 | 545 | return copy; 546 | } 547 | 548 | /* Remove the least recently used memory pages until the cache size is 549 | * within the configured bounds. Pages referenced by cursors or returned 550 | * key/data are not pruned. 
 */
static void
mpage_prune(struct btree *bt)
{
    struct mpage *mp, *next;

    for (mp = TAILQ_FIRST(bt->lru_queue); mp; mp = next) {
        if (bt->stat.cache_size <= bt->stat.max_cache)
            break;
        /* Save the successor before a possible removal of mp. */
        next = TAILQ_NEXT(mp, lru_next);
        /* Only evict clean, unreferenced pages. */
        if (!mp->dirty && mp->ref <= 0) {
            mpage_del(bt, mp);
            mpage_free(mp);
        }
    }
}

/* Mark a page as dirty and push it on the dirty queue.
 */
static void
mpage_dirty(struct btree *bt, struct mpage *mp)
{
    assert(bt != NULL);
    assert(bt->txn != NULL);

    if (!mp->dirty) {
        mp->dirty = 1;
        SIMPLEQ_INSERT_TAIL(bt->txn->dirty_queue, mp, next);
    }
}

/* Touch a page: make it dirty and re-insert into tree with updated pgno.
 */
static struct mpage *
mpage_touch(struct btree *bt, struct mpage *mp)
{
    assert(bt != NULL);
    assert(bt->txn != NULL);
    assert(mp != NULL);

    if (!mp->dirty) {
        DPRINTF("touching page %u -> %u", mp->pgno, bt->txn->next_pgno);
        if (mp->ref == 0)
            mpage_del(bt, mp);
        else {
            /* Page is pinned (cursor/returned data): work on a copy
             * so the referenced original stays intact. */
            if ((mp = mpage_copy(bt, mp)) == NULL)
                return NULL;
        }
        /* Append-only store: the touched page gets the next new pgno. */
        mp->pgno = mp->page->pgno = bt->txn->next_pgno++;
        mpage_dirty(bt, mp);
        mpage_add(bt, mp);

        /* Update the page number to new touched page.
 */
        if (mp->parent != NULL)
            NODEPGNO(NODEPTR(mp->parent,
                mp->parent_index)) = mp->pgno;
    }

    return mp;
}

/* Read page pgno from disk into the caller-supplied buffer and verify
 * the stored page number matches. Sets errno on failure. */
static int
btree_read_page(struct btree *bt, pgno_t pgno, struct page *page)
{
    ssize_t rc;

    DPRINTF("reading page %u", pgno);
    bt->stat.reads++;
    if ((rc = pread(bt->fd, page, bt->head.psize, (off_t)pgno*bt->head.psize)) == 0) {
        DPRINTF("page %u doesn't exist", pgno);
        errno = ENOENT;
        return BT_FAIL;
    } else if (rc != (ssize_t)bt->head.psize) {
        if (rc > 0)
            errno = EINVAL;
        DPRINTF("read: %s", strerror(errno));
        return BT_FAIL;
    }

    if (page->pgno != pgno) {
        DPRINTF("page numbers don't match: %u != %u", pgno, page->pgno);
        errno = EINVAL;
        return BT_FAIL;
    }

    DPRINTF("page %u has flags 0x%X", pgno, page->flags);

    return BT_SUCCESS;
}

/* fsync the backing file unless the tree was opened with BT_NOSYNC. */
int
btree_sync(struct btree *bt)
{
    if (!F_ISSET(bt->flags, BT_NOSYNC))
        return fsync(bt->fd);
    return 0;
}

/* Begin a transaction. Write transactions take an exclusive flock on
 * the file and are limited to one at a time per btree handle. */
struct btree_txn *
btree_txn_begin(struct btree *bt, int rdonly)
{
    struct btree_txn *txn;

    if (!rdonly && bt->txn != NULL) {
        DPRINTF("write transaction already begun");
        errno = EBUSY;
        return NULL;
    }

    if ((txn = calloc(1, sizeof(*txn))) == NULL) {
        DPRINTF("calloc: %s", strerror(errno));
        return NULL;
    }

    if (rdonly) {
        txn->flags |= BT_TXN_RDONLY;
    } else {
        txn->dirty_queue = calloc(1, sizeof(*txn->dirty_queue));
        if (txn->dirty_queue == NULL) {
            free(txn);
            return NULL;
        }
        SIMPLEQ_INIT(txn->dirty_queue);

        DPRINTF("taking write lock on txn %p", txn);
        if (flock(bt->fd, LOCK_EX | LOCK_NB) != 0) {
            DPRINTF("flock: %s", strerror(errno));
            /* Report lock contention uniformly as EBUSY. */
            errno = EBUSY;
            free(txn->dirty_queue);
            free(txn);
            return NULL;
        }
        bt->txn = txn;
    }

    txn->bt = bt;
    /* The transaction holds a reference on the tree; released by
     * btree_txn_abort() via btree_close(). */
    btree_ref(bt);

    if (btree_read_meta(bt, &txn->next_pgno) != BT_SUCCESS) {
        btree_txn_abort(txn);
        return NULL;
    }

    txn->root = bt->meta.root;
    DPRINTF("begin transaction on btree %p, root page %u", bt, txn->root);

    return txn;
}

/* Abort a transaction: discard dirty pages, release the write lock and
 * the tree reference, and free the transaction. NULL is a no-op.
 * Also used by btree_txn_commit() for its common teardown path. */
void
btree_txn_abort(struct btree_txn *txn)
{
    struct mpage *mp;
    struct btree *bt;

    if (txn == NULL)
        return;

    bt = txn->bt;
    DPRINTF("abort transaction on btree %p, root page %u", bt, txn->root);

    if (!F_ISSET(txn->flags, BT_TXN_RDONLY)) {
        /* Discard all dirty pages.
         */
        while (!SIMPLEQ_EMPTY(txn->dirty_queue)) {
            mp = SIMPLEQ_FIRST(txn->dirty_queue);
            assert(mp->ref == 0);	/* cursors should be closed */
            mpage_del(bt, mp);
            SIMPLEQ_REMOVE_HEAD(txn->dirty_queue, next);
            mpage_free(mp);
        }

        DPRINTF("releasing write lock on txn %p", txn);
        txn->bt->txn = NULL;
        if (flock(txn->bt->fd, LOCK_UN) != 0) {
            DPRINTF("failed to unlock fd %d: %s",
                txn->bt->fd, strerror(errno));
        }
        free(txn->dirty_queue);
    }

    btree_close(txn->bt);
    free(txn);
}

/* Commit a write transaction: flush dirty pages with writev() in
 * batches of BT_COMMIT_PAGES, then write a new meta page. Consumes the
 * transaction in all cases (success or failure). */
int
btree_txn_commit(struct btree_txn *txn)
{
    int n, done;
    ssize_t rc;
    off_t size;
    struct mpage *mp;
    struct btree *bt;
    struct iovec iov[BT_COMMIT_PAGES];

    assert(txn != NULL);
    assert(txn->bt != NULL);

    bt = txn->bt;

    if (F_ISSET(txn->flags, BT_TXN_RDONLY)) {
        DPRINTF("attempt to commit read-only transaction");
        btree_txn_abort(txn);
        errno = EPERM;
        return BT_FAIL;
    }

    if (txn != bt->txn) {
        DPRINTF("attempt to commit unknown transaction");
        btree_txn_abort(txn);
        errno = EINVAL;
        return BT_FAIL;
    }

    if (F_ISSET(txn->flags,
        BT_TXN_ERROR)) {
        DPRINTF("error flag is set, can't commit");
        btree_txn_abort(txn);
        errno = EINVAL;
        return BT_FAIL;
    }

    if (SIMPLEQ_EMPTY(txn->dirty_queue))
        goto done;

    /* A previous run may have left a partial page at EOF; pad the file
     * back to a page-size multiple before appending. */
    if (F_ISSET(bt->flags, BT_FIXPADDING)) {
        size = lseek(bt->fd, 0, SEEK_END);
        size += bt->head.psize - (size % bt->head.psize);
        DPRINTF("extending to multiple of page size: %llu", size);
        if (ftruncate(bt->fd, size) != 0) {
            DPRINTF("ftruncate: %s", strerror(errno));
            btree_txn_abort(txn);
            return BT_FAIL;
        }
        bt->flags &= ~BT_FIXPADDING;
    }

    DPRINTF("committing transaction on btree %p, root page %u",
        bt, txn->root);

    /* Commit up to BT_COMMIT_PAGES dirty pages to disk until done.
     */
    do {
        n = 0;
        done = 1;
        SIMPLEQ_FOREACH(mp, txn->dirty_queue, next) {
            DPRINTF("commiting page %u", mp->pgno);
            iov[n].iov_len = bt->head.psize;
            iov[n].iov_base = mp->page;
            if (++n >= BT_COMMIT_PAGES) {
                done = 0;
                break;
            }
        }

        if (n == 0)
            break;

        DPRINTF("commiting %u dirty pages", n);
        /* fd is O_APPEND, so the batch lands at end of file. */
        rc = writev(bt->fd, iov, n);
        if (rc != (ssize_t)bt->head.psize*n) {
            if (rc > 0)
                DPRINTF("short write, filesystem full?");
            else
                DPRINTF("writev: %s", strerror(errno));
            btree_txn_abort(txn);
            return BT_FAIL;
        }

        /* Remove the dirty flag from the written pages.
         */
        while (!SIMPLEQ_EMPTY(txn->dirty_queue)) {
            mp = SIMPLEQ_FIRST(txn->dirty_queue);
            mp->dirty = 0;
            SIMPLEQ_REMOVE_HEAD(txn->dirty_queue, next);
            if (--n == 0)
                break;
        }
    } while (!done);

    /* Sync data pages, then the meta page, then sync again so the new
     * meta page only becomes durable after the data it points to. */
    if (btree_sync(bt) != 0 ||
        btree_write_meta(bt, txn->root, 0) != BT_SUCCESS ||
        btree_sync(bt) != 0) {
        btree_txn_abort(txn);
        return BT_FAIL;
    }

done:
    mpage_prune(bt);
    /* Abort is the common teardown path: releases lock, ref and txn. */
    btree_txn_abort(txn);

    return BT_SUCCESS;
}

/* Write the initial header page (magic, version, page size) and cache
 * it in bt->head. Returns BT_SUCCESS/BT_FAIL, or -1 on alloc failure. */
static int
btree_write_header(struct btree *bt, int fd)
{
    struct stat sb;
    struct bt_head *h;
    struct page *p;
    ssize_t rc;
    unsigned int psize;

    DPRINTF("writing header page");
    assert(bt != NULL);

    /* Ask stat for 'optimal blocksize for I/O'.
     */
    if (fstat(fd, &sb) == 0)
        psize = sb.st_blksize;
    else
        psize = PAGESIZE;

    if ((p = calloc(1, psize)) == NULL)
        return -1;
    p->flags = P_HEAD;

    h = METADATA(p);
    h->magic = BT_MAGIC;
    h->version = BT_VERSION;
    h->psize = psize;
    memcpy(&bt->head, h, sizeof(*h));

    rc = write(fd, p, bt->head.psize);
    free(p);
    if (rc != (ssize_t)bt->head.psize) {
        if (rc > 0)
            DPRINTF("short write, filesystem full?");
        return BT_FAIL;
    }

    return BT_SUCCESS;
}

/* Read and validate the header page; on success copies it into
 * bt->head. Returns 0 on success, -1 with errno set otherwise. */
static int
btree_read_header(struct btree *bt)
{
    char page[PAGESIZE];
    struct page *p;
    struct bt_head *h;
    int rc;

    assert(bt != NULL);

    /* We don't know the page size yet, so use a minimum value.
895 | */ 896 | 897 | if ((rc = pread(bt->fd, page, PAGESIZE, 0)) == 0) { 898 | errno = ENOENT; 899 | return -1; 900 | } else if (rc != PAGESIZE) { 901 | if (rc > 0) 902 | errno = EINVAL; 903 | DPRINTF("read: %s", strerror(errno)); 904 | return -1; 905 | } 906 | 907 | p = (struct page *)page; 908 | 909 | if (!F_ISSET(p->flags, P_HEAD)) { 910 | DPRINTF("page %d not a header page", p->pgno); 911 | errno = EINVAL; 912 | return -1; 913 | } 914 | 915 | h = METADATA(p); 916 | if (h->magic != BT_MAGIC) { 917 | DPRINTF("header has invalid magic"); 918 | errno = EINVAL; 919 | return -1; 920 | } 921 | 922 | if (h->version != BT_VERSION) { 923 | DPRINTF("database is version %u, expected version %u", 924 | bt->head.version, BT_VERSION); 925 | errno = EINVAL; 926 | return -1; 927 | } 928 | 929 | memcpy(&bt->head, h, sizeof(*h)); 930 | return 0; 931 | } 932 | 933 | static int 934 | btree_write_meta(struct btree *bt, pgno_t root, unsigned int flags) 935 | { 936 | struct mpage *mp; 937 | struct bt_meta *meta; 938 | ssize_t rc; 939 | 940 | DPRINTF("writing meta page for root page %u", root); 941 | 942 | assert(bt != NULL); 943 | assert(bt->txn != NULL); 944 | 945 | if ((mp = btree_new_page(bt, P_META)) == NULL) 946 | return -1; 947 | 948 | bt->meta.prev_meta = bt->meta.root; 949 | bt->meta.root = root; 950 | bt->meta.flags = flags; 951 | bt->meta.created_at = time(0); 952 | bt->meta.revisions++; 953 | sha1((unsigned char *)&bt->meta, METAHASHLEN, bt->meta.hash); 954 | 955 | /* Copy the meta data changes to the new meta page. 
 */
    meta = METADATA(mp->page);
    memcpy(meta, &bt->meta, sizeof(*meta));

    /* Meta pages are written directly (fd is O_APPEND), so take the
     * page off the dirty queue ourselves. */
    rc = write(bt->fd, mp->page, bt->head.psize);
    mp->dirty = 0;
    SIMPLEQ_REMOVE_HEAD(bt->txn->dirty_queue, next);
    if (rc != (ssize_t)bt->head.psize) {
        if (rc > 0)
            DPRINTF("short write, filesystem full?");
        return BT_FAIL;
    }

    if ((bt->size = lseek(bt->fd, 0, SEEK_END)) == -1) {
        DPRINTF("failed to update file size: %s", strerror(errno));
        bt->size = 0;
    }

    return BT_SUCCESS;
}

/* Returns true if page p is a valid meta page, false otherwise.
 */
static int
btree_is_meta_page(struct page *p)
{
    struct bt_meta *m;
    unsigned char hash[SHA1_DIGEST_LENGTH];

    m = METADATA(p);
    if (!F_ISSET(p->flags, P_META)) {
        DPRINTF("page %d not a meta page", p->pgno);
        errno = EINVAL;
        return 0;
    }

    /* In an append-only file the root must precede its meta page. */
    if (m->root >= p->pgno && m->root != P_INVALID) {
        DPRINTF("page %d points to an invalid root page", p->pgno);
        errno = EINVAL;
        return 0;
    }

    /* Verify the SHA-1 digest to reject torn/partial meta writes. */
    sha1((unsigned char *)m, METAHASHLEN, hash);
    if (bcmp(hash, m->hash, SHA1_DIGEST_LENGTH) != 0) {
        DPRINTF("page %d has an invalid digest", p->pgno);
        errno = EINVAL;
        return 0;
    }

    return 1;
}

/* Locate the most recent valid meta page and load it into bt->meta.
 * If p_next is non-NULL, stores the next free page number there. */
static int
btree_read_meta(struct btree *bt, pgno_t *p_next)
{
    struct mpage *mp;
    struct bt_meta *meta;
    pgno_t meta_pgno, next_pgno;
    off_t size;

    assert(bt != NULL);

    if ((size = lseek(bt->fd, 0, SEEK_END)) == -1)
        goto fail;

    DPRINTF("btree_read_meta: size = %llu", size);

    if (size < bt->size) {
        DPRINTF("file has shrunk!");
        errno = EIO;
        goto fail;
    }

    if (size == bt->head.psize) {		/* there is only the header */
        if (p_next != NULL)
            *p_next = 1;
        return BT_SUCCESS;		/* new file */
    }

    next_pgno = size / bt->head.psize;
    if (next_pgno == 0) {
        DPRINTF("corrupt file");
        errno = EIO;
        goto fail;
    }

    meta_pgno = next_pgno - 1;

    if (size % bt->head.psize != 0) {
        DPRINTF("filesize not a multiple of the page size!");
        /* Remember to pad the file before the next commit. */
        bt->flags |= BT_FIXPADDING;
        next_pgno++;
    }

    if (p_next != NULL)
        *p_next = next_pgno;

    if (size == bt->size) {
        DPRINTF("size unchanged, keeping current meta page");
        if (F_ISSET(bt->meta.flags, BT_TOMBSTONE)) {
            DPRINTF("file is dead");
            errno = ESTALE;
            return BT_FAIL;
        } else
            return BT_SUCCESS;
    }
    bt->size = size;

    while (meta_pgno > 0) {
        if ((mp = btree_get_mpage(bt, meta_pgno)) == NULL)
            break;
        if (btree_is_meta_page(mp->page)) {
            meta = METADATA(mp->page);
            DPRINTF("flags = 0x%x", meta->flags);
            if (F_ISSET(meta->flags, BT_TOMBSTONE)) {
                DPRINTF("file is dead");
                errno = ESTALE;
                return BT_FAIL;
            } else {
                /* Make copy of last meta page.
*/ 1075 | memcpy(&bt->meta, meta, sizeof(bt->meta)); 1076 | return BT_SUCCESS; 1077 | } 1078 | } 1079 | --meta_pgno; /* scan backwards to first valid meta page */ 1080 | } 1081 | 1082 | errno = EIO; 1083 | fail: 1084 | if (p_next != NULL) 1085 | *p_next = P_INVALID; 1086 | return BT_FAIL; 1087 | } 1088 | 1089 | struct btree * 1090 | btree_open_fd(int fd, unsigned int flags) 1091 | { 1092 | struct btree *bt; 1093 | int fl; 1094 | 1095 | fl = fcntl(fd, F_GETFL, 0); 1096 | if (fcntl(fd, F_SETFL, fl | O_APPEND) == -1) 1097 | return NULL; 1098 | 1099 | if ((bt = calloc(1, sizeof(*bt))) == NULL) 1100 | return NULL; 1101 | bt->fd = fd; 1102 | bt->flags = flags; 1103 | bt->flags &= ~BT_FIXPADDING; 1104 | bt->ref = 1; 1105 | bt->meta.root = P_INVALID; 1106 | 1107 | if ((bt->page_cache = calloc(1, sizeof(*bt->page_cache))) == NULL) 1108 | goto fail; 1109 | bt->stat.max_cache = BT_MAXCACHE_DEF; 1110 | RB_INIT(bt->page_cache); 1111 | 1112 | if ((bt->lru_queue = calloc(1, sizeof(*bt->lru_queue))) == NULL) 1113 | goto fail; 1114 | TAILQ_INIT(bt->lru_queue); 1115 | 1116 | if (btree_read_header(bt) != 0) { 1117 | if (errno != ENOENT) 1118 | goto fail; 1119 | DPRINTF("new database"); 1120 | btree_write_header(bt, bt->fd); 1121 | } 1122 | 1123 | if (btree_read_meta(bt, NULL) != 0) 1124 | goto fail; 1125 | 1126 | DPRINTF("opened database version %u, pagesize %u", 1127 | bt->head.version, bt->head.psize); 1128 | DPRINTF("timestamp: %s", ctime(&bt->meta.created_at)); 1129 | DPRINTF("depth: %u", bt->meta.depth); 1130 | DPRINTF("entries: %llu", bt->meta.entries); 1131 | DPRINTF("revisions: %u", bt->meta.revisions); 1132 | DPRINTF("branch pages: %u", bt->meta.branch_pages); 1133 | DPRINTF("leaf pages: %u", bt->meta.leaf_pages); 1134 | DPRINTF("overflow pages: %u", bt->meta.overflow_pages); 1135 | DPRINTF("root: %u", bt->meta.root); 1136 | DPRINTF("previous meta page: %u", bt->meta.prev_meta); 1137 | 1138 | return bt; 1139 | 1140 | fail: 1141 | free(bt->lru_queue); 1142 | 
    free(bt->page_cache);
    free(bt);
    return NULL;
}

/* Open (or create, unless BT_RDONLY) the database file at path and
 * return a btree handle, or NULL with errno set. */
struct btree *
btree_open(const char *path, unsigned int flags, mode_t mode)
{
    int fd, oflags;
    struct btree *bt;

    if (F_ISSET(flags, BT_RDONLY))
        oflags = O_RDONLY;
    else
        oflags = O_RDWR | O_CREAT | O_APPEND;

    if ((fd = open(path, oflags, mode)) == -1)
        return NULL;

    if ((bt = btree_open_fd(fd, flags)) == NULL)
        close(fd);
    else {
        bt->path = strdup(path);
        DPRINTF("opened btree %p", bt);
    }

    return bt;
}

/* Take an additional reference on the tree handle (used by txns). */
static void
btree_ref(struct btree *bt)
{
    bt->ref++;
    DPRINTF("ref is now %d on btree %p", bt->ref, bt);
}

/* Drop one reference; on the last reference close the fd and free all
 * cached pages and handle state. NULL is a no-op. */
void
btree_close(struct btree *bt)
{
    if (bt == NULL)
        return;

    if (--bt->ref == 0) {
        DPRINTF("ref is zero, closing btree %p", bt);
        close(bt->fd);
        mpage_flush(bt);
        free(bt->lru_queue);
        free(bt->path);
        free(bt->page_cache);
        free(bt);
    } else
        DPRINTF("ref is now %d on btree %p", bt->ref, bt);
}

/* Search for key within a leaf page, using binary search.
 * Returns the smallest entry larger or equal to the key.
 * If exactp is non-null, stores whether the found entry was an exact match
 * in *exactp (1 or 0).
 * If kip is non-null, stores the index of the found entry in *kip.
 * If no entry larger of equal to the key is found, returns NULL.
 */
static struct node *
btree_search_node(struct btree *bt, struct mpage *mp, struct btval *key,
    int *exactp, unsigned int *kip)
{
    unsigned int i = 0;
    int low, high;
    int rc = 0;
    struct node *node;
    struct btval nodekey;

    DPRINTF("searching %lu keys in %s page %u with prefix [%.*s]",
        NUMKEYS(mp),
        IS_LEAF(mp) ? "leaf" : "branch",
        mp->pgno, (int)mp->prefix.len, (char *)mp->prefix.str);

    assert(NUMKEYS(mp) > 0);

    memset(&nodekey, 0, sizeof(nodekey));

    /* Branch pages keep an implicit "smaller than anything" key at
     * index 0, so the binary search starts at 1 there. */
    low = IS_LEAF(mp) ? 0 : 1;
    high = NUMKEYS(mp) - 1;
    while (low <= high) {
        i = (low + high) >> 1;
        node = NODEPTR(mp, i);

        nodekey.size = node->ksize;
        nodekey.data = NODEKEY(node);

        /* User-supplied comparator overrides prefix-aware bt_cmp. */
        if (bt->cmp)
            rc = bt->cmp(key, &nodekey);
        else
            rc = bt_cmp(bt, key, &nodekey, &mp->prefix);

        if (IS_LEAF(mp))
            DPRINTF("found leaf index %u [%.*s], rc = %i",
                i, (int)nodekey.size, (char *)nodekey.data, rc);
        else
            DPRINTF("found branch index %u [%.*s -> %u], rc = %i",
                i, (int)node->ksize, (char *)NODEKEY(node),
                node->n_pgno, rc);

        if (rc == 0)
            break;
        if (rc > 0)
            low = i + 1;
        else
            high = i - 1;
    }

    if (rc > 0) {	/* Found entry is less than the key. */
        i++;	/* Skip to get the smallest entry larger than key. */
        if (i >= NUMKEYS(mp))
            /* There is no entry larger or equal to the key. */
            return NULL;
    }
    if (exactp)
        *exactp = (rc == 0);
    if (kip)	/* Store the key index if requested.
 */
        *kip = i;

    return NODEPTR(mp, i);
}

/* Pop the top page off the cursor stack and drop its page reference. */
static void
cursor_pop_page(struct cursor *cursor)
{
    struct ppage *top;

    top = CURSOR_TOP(cursor);
    CURSOR_POP(cursor);
    top->mpage->ref--;

    DPRINTF("popped page %u off cursor %p", top->mpage->pgno, cursor);

    free(top);
}

/* Push mp onto the cursor stack, pinning it with a page reference.
 * Returns the new stack entry, or NULL on allocation failure. */
static struct ppage *
cursor_push_page(struct cursor *cursor, struct mpage *mp)
{
    struct ppage *ppage;

    DPRINTF("pushing page %u on cursor %p", mp->pgno, cursor);

    if ((ppage = calloc(1, sizeof(*ppage))) == NULL)
        return NULL;
    ppage->mpage = mp;
    mp->ref++;
    CURSOR_PUSH(cursor, ppage);
    return ppage;
}

/* Return page pgno, reading it from disk on a cache miss. */
static struct mpage *
btree_get_mpage(struct btree *bt, pgno_t pgno)
{
    struct mpage *mp;

    mp = mpage_lookup(bt, pgno);
    if (mp == NULL) {
        if ((mp = calloc(1, sizeof(*mp))) == NULL)
            return NULL;
        if ((mp->page = malloc(bt->head.psize)) == NULL) {
            free(mp);
            return NULL;
        }
        if (btree_read_page(bt, pgno, mp->page) != BT_SUCCESS) {
            mpage_free(mp);
            return NULL;
        }
        mp->pgno = pgno;
        mpage_add(bt, mp);
    } else
        DPRINTF("returning page %u from cache", pgno);

    return mp;
}

/* Concatenate prefix s1 and suffix s2 into cs (order reversed for
 * BT_REVERSEKEY trees); *cn holds the buffer size on entry and the
 * result length on return. */
static void
concat_prefix(struct btree *bt, char *s1, size_t n1, char *s2, size_t n2,
    char *cs, size_t *cn)
{
    assert(*cn >= n1 + n2);
    if (F_ISSET(bt->flags, BT_REVERSEKEY)) {
        memcpy(cs, s2, n2);
        memcpy(cs + n2, s1, n1);
    } else {
        memcpy(cs, s1, n1);
        memcpy(cs + n1, s2, n2);
    }
    *cn = n1 + n2;
}

/* Compute the key prefix shared by all entries on page mp from the
 * nearest separator keys in its ancestors; stored in mp->prefix. */
static void
find_common_prefix(struct btree *bt, struct mpage *mp)
{
    indx_t lbound = 0, ubound = 0;
    struct mpage *lp, *up;
    struct
btkey lprefix, uprefix;

    mp->prefix.len = 0;
    /* Prefix compression only applies with the default comparator. */
    if (bt->cmp != NULL)
        return;

    /* Walk up to the closest ancestor with a separator on the left. */
    lp = mp;
    while (lp->parent != NULL) {
        if (lp->parent_index > 0) {
            lbound = lp->parent_index;
            break;
        }
        lp = lp->parent;
    }

    /* Walk up to the closest ancestor with a separator on the right. */
    up = mp;
    while (up->parent != NULL) {
        if (up->parent_index + 1 < (indx_t)NUMKEYS(up->parent)) {
            ubound = up->parent_index + 1;
            break;
        }
        up = up->parent;
    }

    if (lp->parent != NULL && up->parent != NULL) {
        expand_prefix(bt, lp->parent, lbound, &lprefix);
        expand_prefix(bt, up->parent, ubound, &uprefix);
        common_prefix(bt, &lprefix, &uprefix, &mp->prefix);
    }
    else if (mp->parent)
        memcpy(&mp->prefix, &mp->parent->prefix, sizeof(mp->prefix));

    DPRINTF("found common prefix [%.*s] (len %zu) for page %u",
        (int)mp->prefix.len, mp->prefix.str, mp->prefix.len, mp->pgno);
}

/* Descend from root to the leaf page for key (or the leftmost leaf if
 * key is NULL), pushing visited pages on the cursor stack if given and
 * touching them if modify is set. Stores the leaf in *mpp. */
static int
btree_search_page_root(struct btree *bt, struct mpage *root, struct btval *key,
    struct cursor *cursor, int modify, struct mpage **mpp)
{
    struct mpage *mp, *parent;

    if (cursor && cursor_push_page(cursor, root) == NULL)
        return BT_FAIL;

    mp = root;
    while (IS_BRANCH(mp)) {
        unsigned int i = 0;
        struct node *node;

        DPRINTF("branch page %u has %lu keys", mp->pgno, NUMKEYS(mp));
        assert(NUMKEYS(mp) > 1);
        DPRINTF("found index 0 to page %u", NODEPGNO(NODEPTR(mp, 0)));

        if (key == NULL)	/* Initialize cursor to first page.
 */
            i = 0;
        else {
            int exact;
            node = btree_search_node(bt, mp, key, &exact, &i);
            if (node == NULL)
                /* Key larger than all entries: follow last child. */
                i = NUMKEYS(mp) - 1;
            else if (!exact) {
                /* Separator is > key: the key lives in the child to
                 * the left of it. */
                assert(i > 0);
                i--;
            }
        }

        if (key)
            DPRINTF("following index %u for key %.*s",
                i, (int)key->size, (char *)key->data);
        assert((int)i >= 0 && i < NUMKEYS(mp));
        node = NODEPTR(mp, i);

        if (cursor)
            CURSOR_TOP(cursor)->ki = i;

        parent = mp;
        if ((mp = btree_get_mpage(bt, NODEPGNO(node))) == NULL)
            return BT_FAIL;
        mp->parent = parent;
        mp->parent_index = i;
        find_common_prefix(bt, mp);

        if (cursor && cursor_push_page(cursor, mp) == NULL)
            return BT_FAIL;

        if (modify && (mp = mpage_touch(bt, mp)) == NULL)
            return BT_FAIL;
    }

    if (!IS_LEAF(mp)) {
        DPRINTF("internal error, index points to a %02X page!?",
            mp->page->flags);
        return BT_FAIL;
    }

    DPRINTF("found leaf page %u for key %.*s", mp->pgno,
        key ? (int)key->size : 0, key ? (char *)key->data : NULL);

    *mpp = mp;
    return BT_SUCCESS;
}

/* Search for the page a given key should be in.
 * Stores a pointer to the found page in *mpp.
 * If key is NULL, search for the lowest page (used by btree_cursor_first).
 * If cursor is non-null, pushes parent pages on the cursor stack.
 * If modify is true, visited pages are updated with new page numbers.
 */
static int
btree_search_page(struct btree *bt, struct btree_txn *txn, struct btval *key,
    struct cursor *cursor, int modify, struct mpage **mpp)
{
    int rc;
    pgno_t root;
    struct mpage *mp;

    /* Can't modify pages outside a transaction.
     */
    if (txn == NULL && modify) {
        errno = EINVAL;
        return BT_FAIL;
    }

    /* Choose which root page to start with. If a transaction is given
     * use the root page from the transaction, otherwise read the last
     * committed root page.
     */
    if (txn == NULL) {
        if ((rc = btree_read_meta(bt, NULL)) != BT_SUCCESS)
            return rc;
        root = bt->meta.root;
    } else if (F_ISSET(txn->flags, BT_TXN_ERROR)) {
        DPRINTF("transaction has failed, must abort");
        errno = EINVAL;
        return BT_FAIL;
    } else
        root = txn->root;

    if (root == P_INVALID) {		/* Tree is empty. */
        DPRINTF("tree is empty");
        errno = ENOENT;
        return BT_FAIL;
    }

    if ((mp = btree_get_mpage(bt, root)) == NULL)
        return BT_FAIL;

    DPRINTF("root page has flags 0x%X", mp->page->flags);

    assert(mp->parent == NULL);
    assert(mp->prefix.len == 0);

    if (modify && !mp->dirty) {
        if ((mp = mpage_touch(bt, mp)) == NULL)
            return BT_FAIL;
        /* The root moved to a new page number; track it in the txn. */
        txn->root = mp->pgno;
    }

    return btree_search_page_root(bt, mp, key, cursor, modify, mpp);
}

/* Fill *data from a leaf node: either point into the page (pinning it)
 * or, for F_BIGDATA nodes, assemble the value from overflow pages into
 * a freshly allocated buffer. */
static int
btree_read_data(struct btree *bt, struct mpage *mp, struct node *leaf,
    struct btval *data)
{
    struct mpage *omp;		/* overflow mpage */
    size_t psz;
    size_t max;
    size_t sz = 0;
    pgno_t pgno;

    memset(data, 0, sizeof(*data));
    max = bt->head.psize - PAGEHDRSZ;

    if (!F_ISSET(leaf->flags, F_BIGDATA)) {
        data->size = leaf->n_dsize;
        if (data->size > 0) {
            if (mp == NULL) {
                /* No page to pin: hand the caller its own copy. */
                if ((data->data = malloc(data->size)) == NULL)
                    return BT_FAIL;
                memcpy(data->data, NODEDATA(leaf), data->size);
                data->free_data = 1;
                data->mp = NULL;
            } else {
                data->data = NODEDATA(leaf);
                data->free_data = 0;
                /* Zero-copy: point into the page and pin it; the
                 * caller releases the ref via btval_reset(). */
                data->mp = mp;
                mp->ref++;
            }
        }
        return BT_SUCCESS;
    }

    /* Read overflow data.
     */
    DPRINTF("allocating %u byte for overflow data", leaf->n_dsize);
    if ((data->data = malloc(leaf->n_dsize)) == NULL)
        return BT_FAIL;
    data->size = leaf->n_dsize;
    data->free_data = 1;
    data->mp = NULL;
    /* The leaf's payload is the first overflow page number. */
    memcpy(&pgno, NODEDATA(leaf), sizeof(pgno));
    for (sz = 0; sz < data->size; ) {
        if ((omp = btree_get_mpage(bt, pgno)) == NULL ||
            !F_ISSET(omp->page->flags, P_OVERFLOW)) {
            DPRINTF("read overflow page %u failed", pgno);
            free(data->data);
            mpage_free(omp);
            return BT_FAIL;
        }
        psz = data->size - sz;
        if (psz > max)
            psz = max;
        memcpy((char *)data->data + sz, omp->page->ptrs, psz);
        sz += psz;
        pgno = omp->page->p_next_pgno;
    }

    return BT_SUCCESS;
}

/* Look up key and return its value in *data. Either bt or txn may be
 * NULL (but not both); with a txn the lookup sees that txn's root. */
int
btree_txn_get(struct btree *bt, struct btree_txn *txn,
    struct btval *key, struct btval *data)
{
    int rc, exact;
    struct node *leaf;
    struct mpage *mp;

    assert(key);
    assert(data);
    DPRINTF("===> get key [%.*s]", (int)key->size, (char *)key->data);

    if (bt != NULL && txn != NULL && bt != txn->bt) {
        errno = EINVAL;
        return BT_FAIL;
    }

    if (bt == NULL) {
        if (txn == NULL) {
            errno = EINVAL;
            return BT_FAIL;
        }
        bt = txn->bt;
    }

    if (key->size == 0 || key->size > MAXKEYSIZE) {
        errno = EINVAL;
        return BT_FAIL;
    }

    if ((rc = btree_search_page(bt, txn, key, NULL, 0, &mp)) != BT_SUCCESS)
        return rc;

    leaf = btree_search_node(bt, mp, key, &exact, NULL);
    if (leaf && exact)
        rc = btree_read_data(bt, mp, leaf, data);
    else {
        errno = ENOENT;
        rc = BT_FAIL;
    }

    mpage_prune(bt);
    return rc;
}

/* Move the cursor to the adjacent leaf page (right if move_right is
 * set, left otherwise), recursing upward when a parent runs out of
 * keys. Fails with ENOENT at the edge of the tree. */
static int
btree_sibling(struct cursor *cursor, int move_right)
{
    int rc;
    struct node *indx;
    struct ppage *parent, *top;
    struct mpage *mp;

    top = CURSOR_TOP(cursor);
    if ((parent = SLIST_NEXT(top, entry)) == NULL) {
        errno = ENOENT;
        return BT_FAIL;			/* root has no siblings */
    }

    DPRINTF("parent page is page %u, index %u",
        parent->mpage->pgno, parent->ki);

    cursor_pop_page(cursor);
    if (move_right ? (parent->ki + 1 >= NUMKEYS(parent->mpage))
               : (parent->ki == 0)) {
        DPRINTF("no more keys left, moving to %s sibling",
            move_right ? "right" : "left");
        /* Parent exhausted: recurse one level up first. */
        if ((rc = btree_sibling(cursor, move_right)) != BT_SUCCESS)
            return rc;
        parent = CURSOR_TOP(cursor);
    } else {
        if (move_right)
            parent->ki++;
        else
            parent->ki--;
        DPRINTF("just moving to %s index key %u",
            move_right ?
"right" : "left", parent->ki); 1638 | } 1639 | assert(IS_BRANCH(parent->mpage)); 1640 | 1641 | indx = NODEPTR(parent->mpage, parent->ki); 1642 | if ((mp = btree_get_mpage(cursor->bt, indx->n_pgno)) == NULL) 1643 | return BT_FAIL; 1644 | mp->parent = parent->mpage; 1645 | mp->parent_index = parent->ki; 1646 | 1647 | cursor_push_page(cursor, mp); 1648 | find_common_prefix(cursor->bt, mp); 1649 | 1650 | return BT_SUCCESS; 1651 | } 1652 | 1653 | static int 1654 | bt_set_key(struct btree *bt, struct mpage *mp, struct node *node, 1655 | struct btval *key) 1656 | { 1657 | if (key == NULL) 1658 | return 0; 1659 | 1660 | if (mp->prefix.len > 0) { 1661 | key->size = node->ksize + mp->prefix.len; 1662 | key->data = malloc(key->size); 1663 | if (key->data == NULL) 1664 | return -1; 1665 | concat_prefix(bt, 1666 | mp->prefix.str, mp->prefix.len, 1667 | NODEKEY(node), node->ksize, 1668 | key->data, &key->size); 1669 | key->free_data = 1; 1670 | } else { 1671 | key->size = node->ksize; 1672 | key->data = NODEKEY(node); 1673 | key->free_data = 0; 1674 | key->mp = mp; 1675 | mp->ref++; 1676 | } 1677 | 1678 | return 0; 1679 | } 1680 | 1681 | static int 1682 | btree_cursor_next(struct cursor *cursor, struct btval *key, struct btval *data) 1683 | { 1684 | struct ppage *top; 1685 | struct mpage *mp; 1686 | struct node *leaf; 1687 | 1688 | if (cursor->eof) { 1689 | errno = ENOENT; 1690 | return BT_FAIL; 1691 | } 1692 | 1693 | assert(cursor->initialized); 1694 | 1695 | top = CURSOR_TOP(cursor); 1696 | mp = top->mpage; 1697 | 1698 | DPRINTF("cursor_next: top page is %u in cursor %p", mp->pgno, cursor); 1699 | 1700 | if (top->ki + 1 >= NUMKEYS(mp)) { 1701 | DPRINTF("=====> move to next sibling page"); 1702 | if (btree_sibling(cursor, 1) != BT_SUCCESS) { 1703 | cursor->eof = 1; 1704 | return BT_FAIL; 1705 | } 1706 | top = CURSOR_TOP(cursor); 1707 | mp = top->mpage; 1708 | DPRINTF("next page is %u, key index %u", mp->pgno, top->ki); 1709 | } else 1710 | top->ki++; 1711 | 1712 | DPRINTF("==> 
cursor points to page %u with %lu keys, key index %u",
        mp->pgno, NUMKEYS(mp), top->ki);

    assert(IS_LEAF(mp));
    leaf = NODEPTR(mp, top->ki);

    if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS)
        return BT_FAIL;

    if (bt_set_key(cursor->bt, mp, leaf, key) != 0)
        return BT_FAIL;

    return BT_SUCCESS;
}

/* Position the cursor on key (exact match required when exactp is
 * non-NULL), or on the smallest entry greater than key otherwise. */
static int
btree_cursor_set(struct cursor *cursor, struct btval *key, struct btval *data,
    int *exactp)
{
    int rc;
    struct node *leaf;
    struct mpage *mp;
    struct ppage *top;

    assert(cursor);
    assert(key);
    assert(key->size > 0);

    rc = btree_search_page(cursor->bt, cursor->txn, key, cursor, 0, &mp);
    if (rc != BT_SUCCESS)
        return rc;
    assert(IS_LEAF(mp));

    top = CURSOR_TOP(cursor);
    leaf = btree_search_node(cursor->bt, mp, key, exactp, &top->ki);
    if (exactp != NULL && !*exactp) {
        /* BT_CURSOR_EXACT specified and not an exact match.
         */
        errno = ENOENT;
        return BT_FAIL;
    }

    if (leaf == NULL) {
        DPRINTF("===> inexact leaf not found, goto sibling");
        /* All keys on this leaf are smaller: start of next leaf is the
         * smallest entry greater than key. */
        if (btree_sibling(cursor, 1) != BT_SUCCESS)
            return BT_FAIL;		/* no entries matched */
        top = CURSOR_TOP(cursor);
        top->ki = 0;
        mp = top->mpage;
        assert(IS_LEAF(mp));
        leaf = NODEPTR(mp, 0);
    }

    cursor->initialized = 1;
    cursor->eof = 0;

    if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS)
        return BT_FAIL;

    if (bt_set_key(cursor->bt, mp, leaf, key) != 0)
        return BT_FAIL;
    DPRINTF("==> cursor placed on key %.*s",
        (int)key->size, (char *)key->data);

    return BT_SUCCESS;
}

/* Position the cursor on the first (smallest) entry in the tree. */
static int
btree_cursor_first(struct cursor *cursor, struct btval *key, struct btval *data)
{
    int rc;
    struct mpage *mp;
    struct node *leaf;

    rc = btree_search_page(cursor->bt, cursor->txn, NULL, cursor, 0, &mp);
    if (rc != BT_SUCCESS)
        return rc;
    assert(IS_LEAF(mp));

    leaf = NODEPTR(mp, 0);
    cursor->initialized = 1;
    cursor->eof = 0;

    if (data && btree_read_data(cursor->bt, mp, leaf, data) != BT_SUCCESS)
        return BT_FAIL;

    if (bt_set_key(cursor->bt, mp, leaf, key) != 0)
        return BT_FAIL;

    return BT_SUCCESS;
}

/* Public cursor operation dispatcher: position (BT_CURSOR,
 * BT_CURSOR_EXACT), step (BT_NEXT) or rewind (BT_FIRST) the cursor,
 * filling in key/data for the entry it lands on. */
int
btree_cursor_get(struct cursor *cursor, struct btval *key, struct btval *data,
    enum cursor_op op)
{
    int rc;
    int exact = 0;

    assert(cursor);

    switch (op) {
    case BT_CURSOR:
    case BT_CURSOR_EXACT:
        /* Re-positioning: drop any previously pushed page stack. */
        while (CURSOR_TOP(cursor) != NULL)
            cursor_pop_page(cursor);
        if (key == NULL || key->size == 0 || key->size > MAXKEYSIZE) {
            errno = EINVAL;
            rc = BT_FAIL;
        } else if (op == BT_CURSOR_EXACT)
            rc =
btree_cursor_set(cursor, key, data, &exact); 1822 | else 1823 | rc = btree_cursor_set(cursor, key, data, NULL); 1824 | break; 1825 | case BT_NEXT: 1826 | if (!cursor->initialized) 1827 | rc = btree_cursor_first(cursor, key, data); 1828 | else 1829 | rc = btree_cursor_next(cursor, key, data); 1830 | break; 1831 | case BT_FIRST: 1832 | while (CURSOR_TOP(cursor) != NULL) 1833 | cursor_pop_page(cursor); 1834 | rc = btree_cursor_first(cursor, key, data); 1835 | break; 1836 | default: 1837 | DPRINTF("unhandled/unimplemented cursor operation %u", op); 1838 | rc = BT_FAIL; 1839 | break; 1840 | } 1841 | 1842 | mpage_prune(cursor->bt); 1843 | 1844 | return rc; 1845 | } 1846 | 1847 | static struct mpage * 1848 | btree_new_page(struct btree *bt, uint32_t flags) 1849 | { 1850 | struct mpage *mp; 1851 | 1852 | assert(bt != NULL); 1853 | assert(bt->txn != NULL); 1854 | 1855 | DPRINTF("allocating new mpage %u, page size %u", 1856 | bt->txn->next_pgno, bt->head.psize); 1857 | if ((mp = calloc(1, sizeof(*mp))) == NULL) 1858 | return NULL; 1859 | if ((mp->page = malloc(bt->head.psize)) == NULL) { 1860 | free(mp); 1861 | return NULL; 1862 | } 1863 | mp->pgno = mp->page->pgno = bt->txn->next_pgno++; 1864 | mp->page->flags = flags; 1865 | mp->page->lower = PAGEHDRSZ; 1866 | mp->page->upper = bt->head.psize; 1867 | 1868 | if (IS_BRANCH(mp)) 1869 | bt->meta.branch_pages++; 1870 | else if (IS_LEAF(mp)) 1871 | bt->meta.leaf_pages++; 1872 | else if (IS_OVERFLOW(mp)) 1873 | bt->meta.overflow_pages++; 1874 | 1875 | mpage_add(bt, mp); 1876 | mpage_dirty(bt, mp); 1877 | 1878 | return mp; 1879 | } 1880 | 1881 | static size_t 1882 | bt_leaf_size(struct btree *bt, struct btval *key, struct btval *data) 1883 | { 1884 | size_t sz; 1885 | 1886 | sz = LEAFSIZE(key, data); 1887 | if (data->size >= bt->head.psize / BT_MINKEYS) { 1888 | /* put on overflow page */ 1889 | sz -= data->size - sizeof(pgno_t); 1890 | } 1891 | 1892 | return sz + sizeof(indx_t); 1893 | } 1894 | 1895 | static size_t 1896 | 
bt_branch_size(struct btree *bt, struct btval *key)
{
	size_t		 sz;

	sz = INDXSIZE(key);
	if (sz >= bt->head.psize / BT_MINKEYS) {
		/* put on overflow page */
		/* not implemented */
		/* sz -= key->size - sizeof(pgno_t); */
	}

	return sz + sizeof(indx_t);
}

/* Spill <data> onto overflow page <p>, chaining further overflow pages
 * through p_next_pgno as needed; the last page of the chain links to
 * page number 0.
 */
static int
btree_write_overflow_data(struct btree *bt, struct page *p, struct btval *data)
{
	size_t		 done = 0;
	size_t		 sz;
	size_t		 max;
	pgno_t		*linkp;			/* linked page stored here */
	struct mpage	*next = NULL;

	max = bt->head.psize - PAGEHDRSZ;

	while (done < data->size) {
		if (next != NULL)
			p = next->page;
		linkp = &p->p_next_pgno;
		if (data->size - done > max) {
			/* need another overflow page */
			if ((next = btree_new_page(bt, P_OVERFLOW)) == NULL)
				return BT_FAIL;
			*linkp = next->pgno;
			DPRINTF("linking overflow page %u", next->pgno);
		} else
			*linkp = 0;		/* indicates end of list */
		sz = data->size - done;
		if (sz > max)
			sz = max;
		DPRINTF("copying %zu bytes to overflow page %u", sz, p->pgno);
		memcpy(p->ptrs, (char *)data->data + done, sz);
		done += sz;
	}

	return BT_SUCCESS;
}

/* Insert a node on page <mp> at slot <indx>.  Key prefix should already
 * be stripped.  For leaf pages <data> is stored inline, or spilled to a
 * freshly allocated overflow page when it exceeds psize / BT_MINKEYS;
 * for branch pages <pgno> is the child page pointer.  Fails when the
 * page lacks free space (the caller then splits the page).
 */
static int
btree_add_node(struct btree *bt, struct mpage *mp, indx_t indx,
    struct btval *key, struct btval *data, pgno_t pgno, uint8_t flags)
{
	unsigned int	 i;
	size_t		 node_size = NODESIZE;
	indx_t		 ofs;
	struct node	*node;
	struct page	*p;
	struct mpage	*ofp = NULL;		/* overflow page */

	p = mp->page;
	assert(p->upper >= p->lower);

	DPRINTF("add node [%.*s] to %s page %u at index %i, key size %zu",
	    key ? (int)key->size : 0, key ? (char *)key->data : NULL,
	    IS_LEAF(mp) ? "leaf" : "branch",
	    mp->pgno, indx, key ? key->size : 0);

	if (key != NULL)
		node_size += key->size;

	if (IS_LEAF(mp)) {
		assert(data);
		node_size += data->size;
		if (F_ISSET(flags, F_BIGDATA)) {
			/* Data already on overflow page. */
			node_size -= data->size - sizeof(pgno_t);
		} else if (data->size >= bt->head.psize / BT_MINKEYS) {
			/* Put data on overflow page. */
			DPRINTF("data size is %zu, put on overflow page",
			    data->size);
			node_size -= data->size - sizeof(pgno_t);
			if ((ofp = btree_new_page(bt, P_OVERFLOW)) == NULL)
				return BT_FAIL;
			DPRINTF("allocated overflow page %u", ofp->pgno);
			flags |= F_BIGDATA;
		}
	}

	if (node_size + sizeof(indx_t) > SIZELEFT(mp)) {
		DPRINTF("not enough room in page %u, got %lu ptrs",
		    mp->pgno, NUMKEYS(mp));
		DPRINTF("upper - lower = %u - %u = %u", p->upper, p->lower,
		    p->upper - p->lower);
		DPRINTF("node size = %zu", node_size);
		return BT_FAIL;
	}

	/* Move higher pointers up one slot. */
	for (i = NUMKEYS(mp); i > indx; i--)
		p->ptrs[i] = p->ptrs[i - 1];

	/* Adjust free space offsets: nodes grow down from upper. */
	ofs = p->upper - node_size;
	assert(ofs >= p->lower + sizeof(indx_t));
	p->ptrs[indx] = ofs;
	p->upper = ofs;
	p->lower += sizeof(indx_t);

	/* Write the node data. */
	node = NODEPTR(mp, indx);
	node->ksize = (key == NULL) ? 0 : key->size;
	node->flags = flags;
	if (IS_LEAF(mp))
		node->n_dsize = data->size;
	else
		node->n_pgno = pgno;

	if (key)
		memcpy(NODEKEY(node), key->data, key->size);

	if (IS_LEAF(mp)) {
		assert(key);
		if (ofp == NULL) {
			if (F_ISSET(flags, F_BIGDATA))
				memcpy(node->data + key->size, data->data,
				    sizeof(pgno_t));
			else
				memcpy(node->data + key->size, data->data,
				    data->size);
		} else {
			/* Link to the overflow chain, then fill it. */
			memcpy(node->data + key->size, &ofp->pgno,
			    sizeof(pgno_t));
			if (btree_write_overflow_data(bt, ofp->page,
			    data) == BT_FAIL)
				return BT_FAIL;
		}
	}

	return BT_SUCCESS;
}

/* Remove the node at slot <indx> from page <mp>, compacting both the
 * slot pointer array and the node heap.
 */
static void
btree_del_node(struct mpage *mp, indx_t indx)
{
	unsigned int	 sz;
	indx_t		 i, j, numkeys, ptr;
	struct node	*node;
	char		*base;

	DPRINTF("delete node %u on %s page %u", indx,
	    IS_LEAF(mp) ?
"leaf" : "branch", mp->pgno); 2049 | assert(indx < NUMKEYS(mp)); 2050 | 2051 | node = NODEPTR(mp, indx); 2052 | sz = NODESIZE + node->ksize; 2053 | if (IS_LEAF(mp)) { 2054 | if (F_ISSET(node->flags, F_BIGDATA)) 2055 | sz += sizeof(pgno_t); 2056 | else 2057 | sz += NODEDSZ(node); 2058 | } 2059 | 2060 | ptr = mp->page->ptrs[indx]; 2061 | numkeys = NUMKEYS(mp); 2062 | for (i = j = 0; i < numkeys; i++) { 2063 | if (i != indx) { 2064 | mp->page->ptrs[j] = mp->page->ptrs[i]; 2065 | if (mp->page->ptrs[i] < ptr) 2066 | mp->page->ptrs[j] += sz; 2067 | j++; 2068 | } 2069 | } 2070 | 2071 | base = (char *)mp->page + mp->page->upper; 2072 | memcpy(base + sz, base, ptr - mp->page->upper); 2073 | 2074 | mp->page->lower -= sizeof(indx_t); 2075 | mp->page->upper += sz; 2076 | } 2077 | 2078 | struct cursor * 2079 | btree_txn_cursor_open(struct btree *bt, struct btree_txn *txn) 2080 | { 2081 | struct cursor *cursor; 2082 | 2083 | if (bt != NULL && txn != NULL && bt != txn->bt) { 2084 | errno = EINVAL; 2085 | return NULL; 2086 | } 2087 | 2088 | if (bt == NULL) { 2089 | if (txn == NULL) { 2090 | errno = EINVAL; 2091 | return NULL; 2092 | } 2093 | bt = txn->bt; 2094 | } 2095 | 2096 | if ((cursor = calloc(1, sizeof(*cursor))) != NULL) { 2097 | SLIST_INIT(&cursor->stack); 2098 | cursor->bt = bt; 2099 | cursor->txn = txn; 2100 | btree_ref(bt); 2101 | } 2102 | 2103 | return cursor; 2104 | } 2105 | 2106 | void 2107 | btree_cursor_close(struct cursor *cursor) 2108 | { 2109 | if (cursor != NULL) { 2110 | while (!CURSOR_EMPTY(cursor)) 2111 | cursor_pop_page(cursor); 2112 | 2113 | btree_close(cursor->bt); 2114 | free(cursor); 2115 | } 2116 | } 2117 | 2118 | static int 2119 | btree_update_key(struct mpage *mp, indx_t indx, 2120 | struct btval *key) 2121 | { 2122 | indx_t ptr, i, numkeys; 2123 | int delta; 2124 | size_t len; 2125 | struct node *node; 2126 | char *base; 2127 | 2128 | node = NODEPTR(mp, indx); 2129 | ptr = mp->page->ptrs[indx]; 2130 | DPRINTF("update key %u (ofs %u) [%.*s] to [%.*s] 
on page %u", 2131 | indx, ptr, 2132 | (int)node->ksize, (char *)NODEKEY(node), 2133 | (int)key->size, (char *)key->data, 2134 | mp->pgno); 2135 | 2136 | if (key->size != node->ksize) { 2137 | delta = key->size - node->ksize; 2138 | if (delta > 0 && SIZELEFT(mp) < delta) { 2139 | DPRINTF("OUCH! Not enough room, delta = %d", delta); 2140 | return BT_FAIL; 2141 | } 2142 | 2143 | numkeys = NUMKEYS(mp); 2144 | for (i = 0; i < numkeys; i++) { 2145 | if (mp->page->ptrs[i] <= ptr) 2146 | mp->page->ptrs[i] -= delta; 2147 | } 2148 | 2149 | base = (char *)mp->page + mp->page->upper; 2150 | len = ptr - mp->page->upper + NODESIZE; 2151 | memcpy(base - delta, base, len); 2152 | mp->page->upper -= delta; 2153 | 2154 | node = NODEPTR(mp, indx); 2155 | node->ksize = key->size; 2156 | } 2157 | 2158 | memcpy(NODEKEY(node), key->data, key->size); 2159 | 2160 | return BT_SUCCESS; 2161 | } 2162 | 2163 | static int 2164 | btree_adjust_prefix(struct btree *bt, struct mpage *src, int delta) 2165 | { 2166 | indx_t i; 2167 | struct node *node; 2168 | struct btkey tmpkey; 2169 | struct btval key; 2170 | 2171 | DPRINTF("adjusting prefix lengths on page %u with delta %d", 2172 | src->pgno, delta); 2173 | assert(delta != 0); 2174 | 2175 | for (i = 0; i < NUMKEYS(src); i++) { 2176 | node = NODEPTR(src, i); 2177 | tmpkey.len = node->ksize - delta; 2178 | if (delta > 0) { 2179 | if (F_ISSET(bt->flags, BT_REVERSEKEY)) 2180 | memcpy(tmpkey.str, NODEKEY(node), tmpkey.len); 2181 | else 2182 | memcpy(tmpkey.str, (char *)NODEKEY(node) + delta, 2183 | tmpkey.len); 2184 | } else { 2185 | if (F_ISSET(bt->flags, BT_REVERSEKEY)) { 2186 | memcpy(tmpkey.str, NODEKEY(node), node->ksize); 2187 | memcpy(tmpkey.str + node->ksize, src->prefix.str, 2188 | -delta); 2189 | } else { 2190 | bcopy(src->prefix.str + src->prefix.len + delta, 2191 | tmpkey.str, -delta); 2192 | memcpy(tmpkey.str - delta, NODEKEY(node), 2193 | node->ksize); 2194 | } 2195 | } 2196 | key.size = tmpkey.len; 2197 | key.data = tmpkey.str; 2198 | if 
(btree_update_key(src, i, &key) != BT_SUCCESS) 2199 | return BT_FAIL; 2200 | } 2201 | 2202 | return BT_SUCCESS; 2203 | } 2204 | 2205 | /* Move a node from src to dst. 2206 | */ 2207 | static int 2208 | btree_move_node(struct btree *bt, struct mpage *src, indx_t srcindx, 2209 | struct mpage *dst, indx_t dstindx) 2210 | { 2211 | int rc; 2212 | unsigned int pfxlen, mp_pfxlen = 0; 2213 | struct node *srcnode; 2214 | struct mpage *mp = NULL; 2215 | struct btkey tmpkey, srckey; 2216 | struct btval key, data; 2217 | 2218 | assert(src->parent); 2219 | assert(dst->parent); 2220 | 2221 | srcnode = NODEPTR(src, srcindx); 2222 | DPRINTF("moving %s node %u [%.*s] on page %u to node %u on page %u", 2223 | IS_LEAF(src) ? "leaf" : "branch", 2224 | srcindx, 2225 | (int)srcnode->ksize, (char *)NODEKEY(srcnode), 2226 | src->pgno, 2227 | dstindx, dst->pgno); 2228 | 2229 | find_common_prefix(bt, src); 2230 | 2231 | if (IS_BRANCH(src)) { 2232 | /* Need to check if the page the moved node points to 2233 | * changes prefix. 2234 | */ 2235 | if ((mp = btree_get_mpage(bt, NODEPGNO(srcnode))) == NULL) 2236 | return BT_FAIL; 2237 | mp->parent = src; 2238 | mp->parent_index = srcindx; 2239 | find_common_prefix(bt, mp); 2240 | mp_pfxlen = mp->prefix.len; 2241 | } 2242 | 2243 | /* Mark src and dst as dirty. */ 2244 | if ((src = mpage_touch(bt, src)) == NULL || 2245 | (dst = mpage_touch(bt, dst)) == NULL) 2246 | return BT_FAIL; 2247 | 2248 | find_common_prefix(bt, dst); 2249 | 2250 | /* Check if src node has destination page prefix. Otherwise the 2251 | * destination page must expand its prefix on all its nodes. 
2252 | */ 2253 | srckey.len = srcnode->ksize; 2254 | memcpy(srckey.str, NODEKEY(srcnode), srckey.len); 2255 | common_prefix(bt, &srckey, &dst->prefix, &tmpkey); 2256 | if (tmpkey.len != dst->prefix.len) { 2257 | if (btree_adjust_prefix(bt, dst, 2258 | tmpkey.len - dst->prefix.len) != BT_SUCCESS) 2259 | return BT_FAIL; 2260 | memcpy(&dst->prefix, &tmpkey, sizeof(tmpkey)); 2261 | } 2262 | 2263 | if (srcindx == 0 && IS_BRANCH(src)) { 2264 | struct mpage *low; 2265 | 2266 | /* must find the lowest key below src 2267 | */ 2268 | assert(btree_search_page_root(bt, src, NULL, NULL, 0, 2269 | &low) == BT_SUCCESS); 2270 | expand_prefix(bt, low, 0, &srckey); 2271 | DPRINTF("found lowest key [%.*s] on leaf page %u", 2272 | (int)srckey.len, srckey.str, low->pgno); 2273 | } else { 2274 | srckey.len = srcnode->ksize; 2275 | memcpy(srckey.str, NODEKEY(srcnode), srcnode->ksize); 2276 | } 2277 | find_common_prefix(bt, src); 2278 | 2279 | /* expand the prefix */ 2280 | tmpkey.len = sizeof(tmpkey.str); 2281 | concat_prefix(bt, src->prefix.str, src->prefix.len, 2282 | srckey.str, srckey.len, tmpkey.str, &tmpkey.len); 2283 | 2284 | /* Add the node to the destination page. Adjust prefix for 2285 | * destination page. 2286 | */ 2287 | key.size = tmpkey.len; 2288 | key.data = tmpkey.str; 2289 | remove_prefix(bt, &key, dst->prefix.len); 2290 | data.size = NODEDSZ(srcnode); 2291 | data.data = NODEDATA(srcnode); 2292 | rc = btree_add_node(bt, dst, dstindx, &key, &data, NODEPGNO(srcnode), 2293 | srcnode->flags); 2294 | if (rc != BT_SUCCESS) 2295 | return rc; 2296 | 2297 | /* Delete the node from the source page. 2298 | */ 2299 | btree_del_node(src, srcindx); 2300 | 2301 | /* Update the parent separators. 
2302 | */ 2303 | if (srcindx == 0 && src->parent_index != 0) { 2304 | expand_prefix(bt, src, 0, &tmpkey); 2305 | key.size = tmpkey.len; 2306 | key.data = tmpkey.str; 2307 | remove_prefix(bt, &key, src->parent->prefix.len); 2308 | 2309 | DPRINTF("update separator for source page %u to [%.*s]", 2310 | src->pgno, (int)key.size, (char *)key.data); 2311 | if (btree_update_key(src->parent, src->parent_index, 2312 | &key) != BT_SUCCESS) 2313 | return BT_FAIL; 2314 | } 2315 | 2316 | if (srcindx == 0 && IS_BRANCH(src)) { 2317 | struct btval nullkey; 2318 | nullkey.size = 0; 2319 | assert(btree_update_key(src, 0, &nullkey) == BT_SUCCESS); 2320 | } 2321 | 2322 | if (dstindx == 0 && dst->parent_index != 0) { 2323 | expand_prefix(bt, dst, 0, &tmpkey); 2324 | key.size = tmpkey.len; 2325 | key.data = tmpkey.str; 2326 | remove_prefix(bt, &key, dst->parent->prefix.len); 2327 | 2328 | DPRINTF("update separator for destination page %u to [%.*s]", 2329 | dst->pgno, (int)key.size, (char *)key.data); 2330 | if (btree_update_key(dst->parent, dst->parent_index, 2331 | &key) != BT_SUCCESS) 2332 | return BT_FAIL; 2333 | } 2334 | 2335 | if (dstindx == 0 && IS_BRANCH(dst)) { 2336 | struct btval nullkey; 2337 | nullkey.size = 0; 2338 | assert(btree_update_key(dst, 0, &nullkey) == BT_SUCCESS); 2339 | } 2340 | 2341 | /* We can get a new page prefix here! 2342 | * Must update keys in all nodes of this page! 
2343 | */ 2344 | pfxlen = src->prefix.len; 2345 | find_common_prefix(bt, src); 2346 | if (src->prefix.len != pfxlen) { 2347 | if (btree_adjust_prefix(bt, src, 2348 | src->prefix.len - pfxlen) != BT_SUCCESS) 2349 | return BT_FAIL; 2350 | } 2351 | 2352 | pfxlen = dst->prefix.len; 2353 | find_common_prefix(bt, dst); 2354 | if (dst->prefix.len != pfxlen) { 2355 | if (btree_adjust_prefix(bt, dst, 2356 | dst->prefix.len - pfxlen) != BT_SUCCESS) 2357 | return BT_FAIL; 2358 | } 2359 | 2360 | if (IS_BRANCH(dst)) { 2361 | assert(mp); 2362 | mp->parent = dst; 2363 | mp->parent_index = dstindx; 2364 | find_common_prefix(bt, mp); 2365 | if (mp->prefix.len != mp_pfxlen) { 2366 | DPRINTF("moved branch node has changed prefix"); 2367 | if ((mp = mpage_touch(bt, mp)) == NULL) 2368 | return BT_FAIL; 2369 | if (btree_adjust_prefix(bt, mp, 2370 | mp->prefix.len - mp_pfxlen) != BT_SUCCESS) 2371 | return BT_FAIL; 2372 | } 2373 | } 2374 | 2375 | return BT_SUCCESS; 2376 | } 2377 | 2378 | static int 2379 | btree_merge(struct btree *bt, struct mpage *src, struct mpage *dst) 2380 | { 2381 | int rc; 2382 | indx_t i; 2383 | unsigned int pfxlen; 2384 | struct node *srcnode; 2385 | struct btkey tmpkey, dstpfx; 2386 | struct btval key, data; 2387 | 2388 | DPRINTF("merging page %u and %u", src->pgno, dst->pgno); 2389 | 2390 | assert(src->parent); /* can't merge root page */ 2391 | assert(dst->parent); 2392 | assert(bt->txn != NULL); 2393 | 2394 | /* Mark src and dst as dirty. */ 2395 | if ((src = mpage_touch(bt, src)) == NULL || 2396 | (dst = mpage_touch(bt, dst)) == NULL) 2397 | return BT_FAIL; 2398 | 2399 | find_common_prefix(bt, src); 2400 | find_common_prefix(bt, dst); 2401 | 2402 | /* Check if source nodes has destination page prefix. Otherwise 2403 | * the destination page must expand its prefix on all its nodes. 
2404 | */ 2405 | common_prefix(bt, &src->prefix, &dst->prefix, &dstpfx); 2406 | if (dstpfx.len != dst->prefix.len) { 2407 | if (btree_adjust_prefix(bt, dst, 2408 | dstpfx.len - dst->prefix.len) != BT_SUCCESS) 2409 | return BT_FAIL; 2410 | memcpy(&dst->prefix, &dstpfx, sizeof(dstpfx)); 2411 | } 2412 | 2413 | /* Move all nodes from src to dst. 2414 | */ 2415 | for (i = 0; i < NUMKEYS(src); i++) { 2416 | srcnode = NODEPTR(src, i); 2417 | 2418 | /* If branch node 0 (implicit key), find the real key. 2419 | */ 2420 | if (i == 0 && IS_BRANCH(src)) { 2421 | struct mpage *low; 2422 | 2423 | /* must find the lowest key below src 2424 | */ 2425 | assert(btree_search_page_root(bt, src, NULL, NULL, 0, 2426 | &low) == BT_SUCCESS); 2427 | expand_prefix(bt, low, 0, &tmpkey); 2428 | DPRINTF("found lowest key [%.*s] on leaf page %u", 2429 | (int)tmpkey.len, tmpkey.str, low->pgno); 2430 | } else { 2431 | expand_prefix(bt, src, i, &tmpkey); 2432 | } 2433 | 2434 | key.size = tmpkey.len; 2435 | key.data = tmpkey.str; 2436 | 2437 | remove_prefix(bt, &key, dst->prefix.len); 2438 | data.size = NODEDSZ(srcnode); 2439 | data.data = NODEDATA(srcnode); 2440 | rc = btree_add_node(bt, dst, NUMKEYS(dst), &key, 2441 | &data, NODEPGNO(srcnode), srcnode->flags); 2442 | if (rc != BT_SUCCESS) 2443 | return rc; 2444 | } 2445 | 2446 | DPRINTF("dst page %u now has %lu keys (%.1f%% filled)", 2447 | dst->pgno, NUMKEYS(dst), (float)PAGEFILL(bt, dst) / 10); 2448 | 2449 | /* Unlink the src page from parent. 
2450 | */ 2451 | btree_del_node(src->parent, src->parent_index); 2452 | if (src->parent_index == 0) { 2453 | key.size = 0; 2454 | if (btree_update_key(src->parent, 0, &key) != BT_SUCCESS) 2455 | return BT_FAIL; 2456 | 2457 | pfxlen = src->prefix.len; 2458 | find_common_prefix(bt, src); 2459 | assert (src->prefix.len == pfxlen); 2460 | } 2461 | 2462 | if (IS_LEAF(src)) 2463 | bt->meta.leaf_pages--; 2464 | else 2465 | bt->meta.branch_pages--; 2466 | 2467 | return btree_rebalance(bt, src->parent); 2468 | } 2469 | 2470 | #define FILL_THRESHOLD 250 2471 | 2472 | static int 2473 | btree_rebalance(struct btree *bt, struct mpage *mp) 2474 | { 2475 | struct node *node; 2476 | struct mpage *parent; 2477 | struct mpage *root; 2478 | struct mpage *neighbor = NULL; 2479 | indx_t si = 0, di = 0; 2480 | 2481 | assert(bt != NULL); 2482 | assert(bt->txn != NULL); 2483 | assert(mp != NULL); 2484 | 2485 | DPRINTF("rebalancing %s page %u (has %lu keys, %.1f%% full)", 2486 | IS_LEAF(mp) ? "leaf" : "branch", 2487 | mp->pgno, NUMKEYS(mp), (float)PAGEFILL(bt, mp) / 10); 2488 | 2489 | if (PAGEFILL(bt, mp) >= FILL_THRESHOLD) { 2490 | DPRINTF("no need to rebalance page %u, above fill threshold", 2491 | mp->pgno); 2492 | return BT_SUCCESS; 2493 | } 2494 | 2495 | parent = mp->parent; 2496 | 2497 | if (parent == NULL) { 2498 | if (NUMKEYS(mp) == 0) { 2499 | DPRINTF("tree is completely empty"); 2500 | bt->txn->root = P_INVALID; 2501 | bt->meta.depth--; 2502 | bt->meta.leaf_pages--; 2503 | } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) { 2504 | DPRINTF("collapsing root page!"); 2505 | bt->txn->root = NODEPGNO(NODEPTR(mp, 0)); 2506 | if ((root = btree_get_mpage(bt, bt->txn->root)) == NULL) 2507 | return BT_FAIL; 2508 | root->parent = NULL; 2509 | bt->meta.depth--; 2510 | bt->meta.branch_pages--; 2511 | } else 2512 | DPRINTF("root page doesn't need rebalancing"); 2513 | return BT_SUCCESS; 2514 | } 2515 | 2516 | /* The parent (branch page) must have at least 2 pointers, 2517 | * otherwise the tree is 
	 * invalid.
	 */
	assert(NUMKEYS(parent) > 1);

	/* Leaf page fill factor is below the threshold.
	 * Try to move keys from left or right neighbor, or
	 * merge with a neighbor page.
	 */

	/* Find neighbors.
	 */
	if (mp->parent_index == 0) {
		/* We're the leftmost leaf in our parent.
		 */
		DPRINTF("reading right neighbor");
		node = NODEPTR(parent, mp->parent_index + 1);
		if ((neighbor = btree_get_mpage(bt, NODEPGNO(node))) == NULL)
			return BT_FAIL;
		neighbor->parent_index = mp->parent_index + 1;
		si = 0;
		di = NUMKEYS(mp);
	} else {
		/* There is at least one neighbor to the left.
		 */
		DPRINTF("reading left neighbor");
		node = NODEPTR(parent, mp->parent_index - 1);
		if ((neighbor = btree_get_mpage(bt, NODEPGNO(node))) == NULL)
			return BT_FAIL;
		neighbor->parent_index = mp->parent_index - 1;
		si = NUMKEYS(neighbor) - 1;
		di = 0;
	}
	neighbor->parent = parent;

	DPRINTF("found neighbor page %u (%lu keys, %.1f%% full)",
	    neighbor->pgno, NUMKEYS(neighbor), (float)PAGEFILL(bt, neighbor) / 10);

	/* If the neighbor page is above threshold and has at least two
	 * keys, move one key from it.
	 *
	 * Otherwise we should try to merge them, but that might not be
	 * possible, even if both are below threshold, as prefix expansion
	 * might make keys larger. FIXME: detect this
	 */
	if (PAGEFILL(bt, neighbor) >= FILL_THRESHOLD && NUMKEYS(neighbor) >= 2)
		return btree_move_node(bt, neighbor, si, mp, di);
	else { /* FIXME: if (has_enough_room()) */
		/* Always merge into the left-hand page of the pair. */
		if (mp->parent_index == 0)
			return btree_merge(bt, neighbor, mp);
		else
			return btree_merge(bt, mp, neighbor);
	}
}

/* Delete <key> from the database.  When <data> is non-NULL it receives
 * the deleted value.  With a NULL <txn> an implicit write transaction
 * is opened and committed (or aborted on failure).  Returns BT_SUCCESS,
 * or BT_FAIL with errno set (EINVAL for bad arguments or a read-only
 * transaction, ENOENT when the key does not exist).
 */
int
btree_txn_del(struct btree *bt, struct btree_txn *txn,
    struct btval *key, struct btval *data)
{
	int		 rc, exact, close_txn = 0;
	unsigned int	 ki;
	struct node	*leaf;
	struct mpage	*mp;

	DPRINTF("========> delete key %.*s", (int)key->size, (char *)key->data);

	assert(key != NULL);

	if (bt != NULL && txn != NULL && bt != txn->bt) {
		errno = EINVAL;
		return BT_FAIL;
	}

	if (txn != NULL && F_ISSET(txn->flags, BT_TXN_RDONLY)) {
		errno = EINVAL;
		return BT_FAIL;
	}

	if (bt == NULL) {
		if (txn == NULL) {
			errno = EINVAL;
			return BT_FAIL;
		}
		bt = txn->bt;
	}

	if (key->size == 0 || key->size > MAXKEYSIZE) {
		errno = EINVAL;
		return BT_FAIL;
	}

	if (txn == NULL) {
		close_txn = 1;
		if ((txn = btree_txn_begin(bt, 0)) == NULL)
			return BT_FAIL;
	}

	/* modify=1: pages on the search path are copied for writing. */
	if ((rc = btree_search_page(bt, txn, key, NULL, 1, &mp)) != BT_SUCCESS)
		goto done;

	leaf = btree_search_node(bt, mp, key, &exact, &ki);
	if (leaf == NULL || !exact) {
		errno = ENOENT;
		rc = BT_FAIL;
		goto done;
	}

	if (data && (rc = btree_read_data(bt, NULL, leaf, data)) != BT_SUCCESS)
		goto done;

	btree_del_node(mp, ki);
	bt->meta.entries--;
	rc = btree_rebalance(bt, mp);
	if (rc != BT_SUCCESS)
		txn->flags |= BT_TXN_ERROR;

done:
	if (close_txn) {
		if (rc == BT_SUCCESS)
			rc = btree_txn_commit(txn);
		else
			btree_txn_abort(txn);
	}
	mpage_prune(bt);
	return rc;
}

/* Reduce the length of the prefix separator <sep> to the minimum length
 * that still makes it uniquely distinguishable from <min>.
 *
 * <min> is guaranteed to be sorted less than <sep>
 *
 * On return, <sep> is modified to the minimum length.
 */
static void
bt_reduce_separator(struct btree *bt, struct node *min, struct btval *sep)
{
	size_t		 n = 0;
	char		*p1;
	char		*p2;

	if (F_ISSET(bt->flags, BT_REVERSEKEY)) {

		assert(sep->size > 0);

		/* Compare from the tail: the prefix is reversed. */
		p1 = (char *)NODEKEY(min) + min->ksize - 1;
		p2 = (char *)sep->data + sep->size - 1;

		while (p1 >= (char *)NODEKEY(min) && *p1 == *p2) {
			assert(p2 > (char *)sep->data);
			p1--;
			p2--;
			n++;
		}

		sep->data = p2;
		sep->size = n + 1;
	} else {

		assert(min->ksize > 0);
		assert(sep->size > 0);

		p1 = (char *)NODEKEY(min);
		p2 = (char *)sep->data;

		/* Keep the common run plus one distinguishing byte. */
		while (*p1 == *p2) {
			p1++;
			p2++;
			n++;
			if (n == min->ksize || n == sep->size)
				break;
		}

		sep->size = n + 1;
	}

	DPRINTF("reduced separator to [%.*s] > [%.*s]",
	    (int)sep->size, (char *)sep->data,
	    (int)min->ksize, (char *)NODEKEY(min));
}

/* Split page <*mpp>, and insert <key,(data|newpgno)> in either left or
 * right sibling, at index <*newindxp> (as if unsplit). Updates *mpp and
 * *newindxp with the actual values after split, ie if *mpp and *newindxp
 * refer to a node in the new right sibling page.
 */
static int
btree_split(struct btree *bt, struct mpage **mpp, unsigned int *newindxp,
    struct btval *newkey, struct btval *newdata, pgno_t newpgno)
{
	uint8_t		 flags;
	int		 rc = BT_SUCCESS, ins_new = 0;
	indx_t		 newindx;
	pgno_t		 pgno = 0;
	size_t		 orig_pfx_len, left_pfx_diff, right_pfx_diff, pfx_diff;
	unsigned int	 i, j, split_indx;
	struct node	*node;
	struct mpage	*pright, *p, *mp;
	struct btval	 sepkey, rkey, rdata;
	struct btkey	 tmpkey;
	struct page	*copy;

	assert(bt != NULL);
	assert(bt->txn != NULL);

	mp = *mpp;
	newindx = *newindxp;

	DPRINTF("-----> splitting %s page %u and adding [%.*s] at index %i",
	    IS_LEAF(mp) ? "leaf" : "branch", mp->pgno,
	    (int)newkey->size, (char *)newkey->data, *newindxp);
	DPRINTF("page %u has prefix [%.*s]", mp->pgno,
	    (int)mp->prefix.len, (char *)mp->prefix.str);
	orig_pfx_len = mp->prefix.len;

	if (mp->parent == NULL) {
		/* Splitting the root: grow the tree by one level. */
		if ((mp->parent = btree_new_page(bt, P_BRANCH)) == NULL)
			return BT_FAIL;
		mp->parent_index = 0;
		bt->txn->root = mp->parent->pgno;
		DPRINTF("root split! new root = %u", mp->parent->pgno);
		bt->meta.depth++;

		/* Add left (implicit) pointer. */
		if (btree_add_node(bt, mp->parent, 0, NULL, NULL,
		    mp->pgno, 0) != BT_SUCCESS)
			return BT_FAIL;
	} else {
		DPRINTF("parent branch page is %u", mp->parent->pgno);
	}

	/* Create a right sibling. */
	if ((pright = btree_new_page(bt, mp->page->flags)) == NULL)
		return BT_FAIL;
	pright->parent = mp->parent;
	pright->parent_index = mp->parent_index + 1;
	DPRINTF("new right sibling: page %u", pright->pgno);

	/* Move half of the keys to the right sibling.  Work from a copy
	 * of the full page while the live page is reset to empty.
	 */
	if ((copy = malloc(bt->head.psize)) == NULL)
		return BT_FAIL;
	memcpy(copy, mp->page, bt->head.psize);
	assert(mp->ref == 0);				/* XXX */
	memset(&mp->page->ptrs, 0, bt->head.psize - PAGEHDRSZ);
	mp->page->lower = PAGEHDRSZ;
	mp->page->upper = bt->head.psize;

	split_indx = NUMKEYSP(copy) / 2 + 1;

	/* First find the separating key between the split pages.
	 */
	memset(&sepkey, 0, sizeof(sepkey));
	if (newindx == split_indx) {
		sepkey.size = newkey->size;
		sepkey.data = newkey->data;
		remove_prefix(bt, &sepkey, mp->prefix.len);
	} else {
		node = NODEPTRP(copy, split_indx);
		sepkey.size = node->ksize;
		sepkey.data = NODEKEY(node);
	}

	if (IS_LEAF(mp) && bt->cmp == NULL) {
		/* Find the smallest separator. */
		/* Ref: Prefix B-trees, R. Bayer, K. Unterauer, 1977 */
		node = NODEPTRP(copy, split_indx - 1);
		bt_reduce_separator(bt, node, &sepkey);
	}

	/* Fix separator wrt parent prefix. */
	if (bt->cmp == NULL) {
		tmpkey.len = sizeof(tmpkey.str);
		concat_prefix(bt, mp->prefix.str, mp->prefix.len,
		    sepkey.data, sepkey.size, tmpkey.str, &tmpkey.len);
		sepkey.data = tmpkey.str;
		sepkey.size = tmpkey.len;
	}

	DPRINTF("separator is [%.*s]", (int)sepkey.size, (char *)sepkey.data);

	/* Copy separator key to the parent, recursively splitting the
	 * parent if it is full.
	 */
	if (SIZELEFT(pright->parent) < bt_branch_size(bt, &sepkey)) {
		rc = btree_split(bt, &pright->parent, &pright->parent_index,
		    &sepkey, NULL, pright->pgno);

		/* Right page might now have changed parent.
		 * Check if left page also changed parent.
		 */
		if (pright->parent != mp->parent &&
		    mp->parent_index >= NUMKEYS(mp->parent)) {
			mp->parent = pright->parent;
			mp->parent_index = pright->parent_index - 1;
		}
	} else {
		remove_prefix(bt, &sepkey, pright->parent->prefix.len);
		rc = btree_add_node(bt, pright->parent, pright->parent_index,
		    &sepkey, NULL, pright->pgno, 0);
	}
	if (rc != BT_SUCCESS) {
		free(copy);
		return BT_FAIL;
	}

	/* Update prefix for right and left page, if the parent was split.
	 */
	find_common_prefix(bt, pright);
	assert(orig_pfx_len <= pright->prefix.len);
	right_pfx_diff = pright->prefix.len - orig_pfx_len;

	find_common_prefix(bt, mp);
	assert(orig_pfx_len <= mp->prefix.len);
	left_pfx_diff = mp->prefix.len - orig_pfx_len;

	/* Redistribute the saved nodes over the two pages, inserting the
	 * new entry at its sorted position along the way.  <i> walks the
	 * copy, <j> is the insert slot on the current target page.
	 */
	for (i = j = 0; i <= NUMKEYSP(copy); j++) {
		if (i < split_indx) {
			/* Re-insert in left sibling. */
			p = mp;
			pfx_diff = left_pfx_diff;
		} else {
			/* Insert in right sibling. */
			if (i == split_indx)
				/* Reset insert index for right sibling. */
				j = (i == newindx && ins_new);
			p = pright;
			pfx_diff = right_pfx_diff;
		}

		if (i == newindx && !ins_new) {
			/* Insert the original entry that caused the split. */
			rkey.data = newkey->data;
			rkey.size = newkey->size;
			if (IS_LEAF(mp)) {
				rdata.data = newdata->data;
				rdata.size = newdata->size;
			} else
				pgno = newpgno;
			flags = 0;
			/* New key is not yet prefix-compressed: strip the
			 * target page's whole prefix.
			 */
			pfx_diff = p->prefix.len;

			ins_new = 1;

			/* Update page and index for the new key. */
			*newindxp = j;
			*mpp = p;
		} else if (i == NUMKEYSP(copy)) {
			break;
		} else {
			node = NODEPTRP(copy, i);
			rkey.data = NODEKEY(node);
			rkey.size = node->ksize;
			if (IS_LEAF(mp)) {
				rdata.data = NODEDATA(node);
				rdata.size = node->n_dsize;
			} else
				pgno = node->n_pgno;
			flags = node->flags;

			i++;
		}

		if (!IS_LEAF(mp) && j == 0) {
			/* First branch index doesn't need key data. */
			rkey.size = 0;
		} else
			remove_prefix(bt, &rkey, pfx_diff);

		rc = btree_add_node(bt, p, j, &rkey, &rdata, pgno,flags);
	}

	free(copy);
	return rc;
}

/* Store <key>/<data>.  With BT_NOOVERWRITE an existing key fails with
 * EEXIST; otherwise the old entry is replaced.  With a NULL <txn> an
 * implicit write transaction is opened and committed (or aborted on
 * failure).
 */
int
btree_txn_put(struct btree *bt, struct btree_txn *txn,
    struct btval *key, struct btval *data, unsigned int flags)
{
	int		 rc = BT_SUCCESS, exact, close_txn = 0;
	unsigned int	 ki;
	struct node	*leaf;
	struct mpage	*mp;
	struct btval	 xkey;

	assert(key != NULL);
	assert(data != NULL);

	if (bt != NULL && txn != NULL && bt != txn->bt) {
		errno = EINVAL;
		return BT_FAIL;
	}

	if (txn != NULL && F_ISSET(txn->flags, BT_TXN_RDONLY)) {
		errno = EINVAL;
		return BT_FAIL;
	}

	if (bt == NULL) {
		if (txn == NULL) {
			errno = EINVAL;
			return BT_FAIL;
		}
		bt = txn->bt;
	}

	if (key->size == 0 || key->size > MAXKEYSIZE) {
		errno = EINVAL;
		return BT_FAIL;
	}

	DPRINTF("==> put key %.*s, size %zu, data size %zu",
	    (int)key->size, (char *)key->data, key->size, data->size);

	if (txn == NULL) {
		close_txn = 1;
		if ((txn = btree_txn_begin(bt, 0)) == NULL)
			return BT_FAIL;
	}

	rc = btree_search_page(bt, txn, key, NULL, 1, &mp);
	if (rc == BT_SUCCESS) {
		leaf = btree_search_node(bt,
mp, key, &exact, &ki); 2938 | if (leaf && exact) { 2939 | if (F_ISSET(flags, BT_NOOVERWRITE)) { 2940 | DPRINTF("duplicate key %.*s", 2941 | (int)key->size, (char *)key->data); 2942 | errno = EEXIST; 2943 | rc = BT_FAIL; 2944 | goto done; 2945 | } 2946 | btree_del_node(mp, ki); 2947 | } 2948 | if (leaf == NULL) { /* append if not found */ 2949 | ki = NUMKEYS(mp); 2950 | DPRINTF("appending key at index %i", ki); 2951 | } 2952 | } else if (errno == ENOENT) { 2953 | /* new file, just write a root leaf page */ 2954 | DPRINTF("allocating new root leaf page"); 2955 | if ((mp = btree_new_page(bt, P_LEAF)) == NULL) { 2956 | rc = BT_FAIL; 2957 | goto done; 2958 | } 2959 | txn->root = mp->pgno; 2960 | bt->meta.depth++; 2961 | ki = 0; 2962 | } 2963 | else 2964 | goto done; 2965 | 2966 | assert(IS_LEAF(mp)); 2967 | DPRINTF("there are %lu keys, should insert new key at index %i", 2968 | NUMKEYS(mp), ki); 2969 | 2970 | /* Copy the key pointer as it is modified by the prefix code. The 2971 | * caller might have malloc'ed the data. 2972 | */ 2973 | xkey.data = key->data; 2974 | xkey.size = key->size; 2975 | 2976 | if (SIZELEFT(mp) < bt_leaf_size(bt, key, data)) { 2977 | rc = btree_split(bt, &mp, &ki, &xkey, data, P_INVALID); 2978 | } else { 2979 | /* There is room already in this leaf page. 
*/ 2980 | remove_prefix(bt, &xkey, mp->prefix.len); 2981 | rc = btree_add_node(bt, mp, ki, &xkey, data, 0, 0); 2982 | } 2983 | 2984 | if (rc != BT_SUCCESS) 2985 | txn->flags |= BT_TXN_ERROR; 2986 | else 2987 | bt->meta.entries++; 2988 | 2989 | done: 2990 | if (close_txn) { 2991 | if (rc == BT_SUCCESS) 2992 | rc = btree_txn_commit(txn); 2993 | else 2994 | btree_txn_abort(txn); 2995 | } 2996 | mpage_prune(bt); 2997 | return rc; 2998 | } 2999 | 3000 | static pgno_t 3001 | btree_compact_tree(struct btree *bt, pgno_t pgno, struct btree *btc) 3002 | { 3003 | ssize_t rc; 3004 | indx_t i; 3005 | pgno_t *pnext, next; 3006 | struct node *node; 3007 | struct page *p; 3008 | struct mpage *mp; 3009 | 3010 | /* Get the page and make a copy of it. 3011 | */ 3012 | if ((mp = btree_get_mpage(bt, pgno)) == NULL) 3013 | return P_INVALID; 3014 | if ((p = malloc(bt->head.psize)) == NULL) 3015 | return P_INVALID; 3016 | memcpy(p, mp->page, bt->head.psize); 3017 | 3018 | /* Go through all nodes in the (copied) page and update the 3019 | * page pointers. 
3020 | */ 3021 | if (F_ISSET(p->flags, P_BRANCH)) { 3022 | for (i = 0; i < NUMKEYSP(p); i++) { 3023 | node = NODEPTRP(p, i); 3024 | node->n_pgno = btree_compact_tree(bt, node->n_pgno, btc); 3025 | if (node->n_pgno == P_INVALID) { 3026 | free(p); 3027 | return P_INVALID; 3028 | } 3029 | } 3030 | } else if (F_ISSET(p->flags, P_LEAF)) { 3031 | for (i = 0; i < NUMKEYSP(p); i++) { 3032 | node = NODEPTRP(p, i); 3033 | if (F_ISSET(node->flags, F_BIGDATA)) { 3034 | memcpy(&next, NODEDATA(node), sizeof(next)); 3035 | next = btree_compact_tree(bt, next, btc); 3036 | if (next == P_INVALID) { 3037 | free(p); 3038 | return P_INVALID; 3039 | } 3040 | memcpy(NODEDATA(node), &next, sizeof(next)); 3041 | } 3042 | } 3043 | } else if (F_ISSET(p->flags, P_OVERFLOW)) { 3044 | pnext = &p->p_next_pgno; 3045 | if (*pnext > 0) { 3046 | *pnext = btree_compact_tree(bt, *pnext, btc); 3047 | if (*pnext == P_INVALID) { 3048 | free(p); 3049 | return P_INVALID; 3050 | } 3051 | } 3052 | } else 3053 | assert(0); 3054 | 3055 | pgno = p->pgno = btc->txn->next_pgno++; 3056 | rc = write(btc->fd, p, bt->head.psize); 3057 | free(p); 3058 | if (rc != (ssize_t)bt->head.psize) 3059 | return P_INVALID; 3060 | mpage_prune(bt); 3061 | return pgno; 3062 | } 3063 | 3064 | int 3065 | btree_compact(struct btree *bt) 3066 | { 3067 | char *compact_path = NULL; 3068 | struct btree *btc; 3069 | struct btree_txn *txn, *txnc = NULL; 3070 | int fd; 3071 | pgno_t root; 3072 | 3073 | assert(bt != NULL); 3074 | 3075 | DPRINTF("compacting btree %p with path %s", bt, bt->path); 3076 | 3077 | if (bt->path == NULL) { 3078 | errno = EINVAL; 3079 | return BT_FAIL; 3080 | } 3081 | 3082 | if ((txn = btree_txn_begin(bt, 0)) == NULL) 3083 | return BT_FAIL; 3084 | 3085 | if (asprintf(&compact_path, "%s.compact.XXXXXX", bt->path) == -1) { 3086 | btree_txn_abort(txn); 3087 | return BT_FAIL; 3088 | } 3089 | fd = mkstemp(compact_path); 3090 | if (fd == -1) { 3091 | free(compact_path); 3092 | btree_txn_abort(txn); 3093 | return BT_FAIL; 
3094 | } 3095 | 3096 | if ((btc = btree_open_fd(fd, 0)) == NULL) 3097 | goto failed; 3098 | memcpy(&btc->meta, &bt->meta, sizeof(bt->meta)); 3099 | btc->meta.revisions = 0; 3100 | 3101 | if ((txnc = btree_txn_begin(btc, 0)) == NULL) 3102 | goto failed; 3103 | 3104 | if (bt->meta.root != P_INVALID) { 3105 | root = btree_compact_tree(bt, bt->meta.root, btc); 3106 | if (root == P_INVALID) 3107 | goto failed; 3108 | if (btree_write_meta(btc, root, 0) != BT_SUCCESS) 3109 | goto failed; 3110 | } 3111 | 3112 | fsync(fd); 3113 | 3114 | DPRINTF("renaming %s to %s", compact_path, bt->path); 3115 | if (rename(compact_path, bt->path) != 0) 3116 | goto failed; 3117 | 3118 | /* Write a "tombstone" meta page so other processes can pick up 3119 | * the change and re-open the file. 3120 | */ 3121 | if (btree_write_meta(bt, P_INVALID, BT_TOMBSTONE) != BT_SUCCESS) 3122 | goto failed; 3123 | 3124 | btree_txn_abort(txn); 3125 | btree_txn_abort(txnc); 3126 | free(compact_path); 3127 | btree_close(btc); 3128 | mpage_prune(bt); 3129 | return 0; 3130 | 3131 | failed: 3132 | btree_txn_abort(txn); 3133 | btree_txn_abort(txnc); 3134 | unlink(compact_path); 3135 | free(compact_path); 3136 | btree_close(btc); 3137 | mpage_prune(bt); 3138 | return BT_FAIL; 3139 | } 3140 | 3141 | /* Reverts the last change. Truncates the file at the last root page. 
3142 | */ 3143 | int 3144 | btree_revert(struct btree *bt) 3145 | { 3146 | if (btree_read_meta(bt, NULL) != 0) 3147 | return -1; 3148 | 3149 | DPRINTF("truncating file at page %u", bt->meta.root); 3150 | return ftruncate(bt->fd, bt->head.psize * bt->meta.root); 3151 | } 3152 | 3153 | void 3154 | btree_set_cache_size(struct btree *bt, unsigned int cache_size) 3155 | { 3156 | bt->stat.max_cache = cache_size; 3157 | } 3158 | 3159 | unsigned int 3160 | btree_get_flags(struct btree *bt) 3161 | { 3162 | return (bt->flags & ~BT_FIXPADDING); 3163 | } 3164 | 3165 | const char * 3166 | btree_get_path(struct btree *bt) 3167 | { 3168 | return bt->path; 3169 | } 3170 | 3171 | const struct btree_stat * 3172 | btree_stat(struct btree *bt) 3173 | { 3174 | if (bt == NULL) 3175 | return NULL; 3176 | 3177 | bt->stat.branch_pages = bt->meta.branch_pages; 3178 | bt->stat.leaf_pages = bt->meta.leaf_pages; 3179 | bt->stat.overflow_pages = bt->meta.overflow_pages; 3180 | bt->stat.revisions = bt->meta.revisions; 3181 | bt->stat.depth = bt->meta.depth; 3182 | bt->stat.entries = bt->meta.entries; 3183 | bt->stat.psize = bt->head.psize; 3184 | bt->stat.created_at = bt->meta.created_at; 3185 | 3186 | return &bt->stat; 3187 | } 3188 | 3189 | -------------------------------------------------------------------------------- /btree.h: -------------------------------------------------------------------------------- 1 | /* $OpenBSD: btree.h,v 1.6 2010/07/02 01:43:00 martinh Exp $ */ 2 | 3 | /* 4 | * Copyright (c) 2009, 2010 Martin Hedenfalk 5 | * 6 | * Permission to use, copy, modify, and distribute this software for any 7 | * purpose with or without fee is hereby granted, provided that the above 8 | * copyright notice and this permission notice appear in all copies. 9 | * 10 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 | * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 15 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 16 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 | */ 18 | 19 | #ifndef _btree_h_ 20 | #define _btree_h_ 21 | 22 | #include 23 | #include 24 | #include "sha1tiny.h" 25 | 26 | struct mpage; 27 | struct cursor; 28 | struct btree_txn; 29 | 30 | struct btval { 31 | void *data; 32 | size_t size; 33 | int free_data; /* true if data malloc'd */ 34 | struct mpage *mp; /* ref'd memory page */ 35 | }; 36 | 37 | typedef int (*bt_cmp_func)(const struct btval *a, 38 | const struct btval *b); 39 | typedef void (*bt_prefix_func)(const struct btval *a, 40 | const struct btval *b, 41 | struct btval *sep); 42 | 43 | #define BT_NOOVERWRITE 1 44 | 45 | enum cursor_op { /* cursor operations */ 46 | BT_CURSOR, /* position at given key */ 47 | BT_CURSOR_EXACT, /* position at key, or fail */ 48 | BT_FIRST, 49 | BT_NEXT, 50 | BT_LAST, /* not implemented */ 51 | BT_PREV /* not implemented */ 52 | }; 53 | 54 | /* return codes */ 55 | #define BT_FAIL -1 56 | #define BT_SUCCESS 0 57 | 58 | /* btree flags */ 59 | #define BT_NOSYNC 0x02 /* don't fsync after commit */ 60 | #define BT_RDONLY 0x04 /* read only */ 61 | #define BT_REVERSEKEY 0x08 /* use reverse string keys */ 62 | 63 | struct btree_stat { 64 | unsigned long long int hits; /* cache hits */ 65 | unsigned long long int reads; /* page reads */ 66 | unsigned int max_cache; /* max cached pages */ 67 | unsigned int cache_size; /* current cache size */ 68 | unsigned int branch_pages; 69 | unsigned int leaf_pages; 70 | unsigned int overflow_pages; 71 | unsigned int revisions; 72 | unsigned int depth; 73 | unsigned long long int entries; 74 | unsigned int psize; 75 | time_t created_at; 76 | }; 77 | 78 | struct btree *btree_open_fd(int fd, unsigned 
int flags); 79 | struct btree *btree_open(const char *path, unsigned int flags, 80 | mode_t mode); 81 | void btree_close(struct btree *bt); 82 | const struct btree_stat *btree_stat(struct btree *bt); 83 | 84 | struct btree_txn *btree_txn_begin(struct btree *bt, int rdonly); 85 | int btree_txn_commit(struct btree_txn *txn); 86 | void btree_txn_abort(struct btree_txn *txn); 87 | 88 | int btree_txn_get(struct btree *bt, struct btree_txn *txn, 89 | struct btval *key, struct btval *data); 90 | int btree_txn_put(struct btree *bt, struct btree_txn *txn, 91 | struct btval *key, struct btval *data, 92 | unsigned int flags); 93 | int btree_txn_del(struct btree *bt, struct btree_txn *txn, 94 | struct btval *key, struct btval *data); 95 | 96 | #define btree_get(bt, key, data) \ 97 | btree_txn_get(bt, NULL, key, data) 98 | #define btree_put(bt, key, data, flags) \ 99 | btree_txn_put(bt, NULL, key, data, flags) 100 | #define btree_del(bt, key, data) \ 101 | btree_txn_del(bt, NULL, key, data) 102 | 103 | void btree_set_cache_size(struct btree *bt, 104 | unsigned int cache_size); 105 | unsigned int btree_get_flags(struct btree *bt); 106 | const char *btree_get_path(struct btree *bt); 107 | 108 | #define btree_cursor_open(bt) \ 109 | btree_txn_cursor_open(bt, NULL) 110 | struct cursor *btree_txn_cursor_open(struct btree *bt, 111 | struct btree_txn *txn); 112 | void btree_cursor_close(struct cursor *cursor); 113 | int btree_cursor_get(struct cursor *cursor, 114 | struct btval *key, struct btval *data, 115 | enum cursor_op op); 116 | 117 | int btree_sync(struct btree *bt); 118 | int btree_compact(struct btree *bt); 119 | int btree_revert(struct btree *bt); 120 | 121 | int btree_cmp(struct btree *bt, const struct btval *a, 122 | const struct btval *b); 123 | void btval_reset(struct btval *btv); 124 | 125 | #endif 126 | 127 | -------------------------------------------------------------------------------- /configure.ac: 
-------------------------------------------------------------------------------- 1 | AC_INIT([btree], [0.4.0], [bug-report@cobra-kai.com]) 2 | AC_CANONICAL_HOST 3 | AM_INIT_AUTOMAKE([foreign dist-zip]) 4 | AC_PROG_CC 5 | AC_PROG_RANLIB 6 | AC_CHECK_HEADERS([sys/file.h]) 7 | AC_CONFIG_FILES([Makefile]) 8 | AC_OUTPUT 9 | -------------------------------------------------------------------------------- /include/sys/queue.h: -------------------------------------------------------------------------------- 1 | /* $OpenBSD: queue.h,v 1.32 2007/04/30 18:42:34 pedro Exp $ */ 2 | /* $NetBSD: queue.h,v 1.11 1996/05/16 05:17:14 mycroft Exp $ */ 3 | 4 | /* 5 | * Copyright (c) 1991, 1993 6 | * The Regents of the University of California. All rights reserved. 7 | * 8 | * Redistribution and use in source and binary forms, with or without 9 | * modification, are permitted provided that the following conditions 10 | * are met: 11 | * 1. Redistributions of source code must retain the above copyright 12 | * notice, this list of conditions and the following disclaimer. 13 | * 2. Redistributions in binary form must reproduce the above copyright 14 | * notice, this list of conditions and the following disclaimer in the 15 | * documentation and/or other materials provided with the distribution. 16 | * 3. Neither the name of the University nor the names of its contributors 17 | * may be used to endorse or promote products derived from this software 18 | * without specific prior written permission. 19 | * 20 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 | * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 | * SUCH DAMAGE. 31 | * 32 | * @(#)queue.h 8.5 (Berkeley) 8/20/94 33 | */ 34 | 35 | #ifndef _SYS_QUEUE_H_ 36 | #define _SYS_QUEUE_H_ 37 | 38 | /* 39 | * This file defines five types of data structures: singly-linked lists, 40 | * lists, simple queues, tail queues, and circular queues. 41 | * 42 | * 43 | * A singly-linked list is headed by a single forward pointer. The elements 44 | * are singly linked for minimum space and pointer manipulation overhead at 45 | * the expense of O(n) removal for arbitrary elements. New elements can be 46 | * added to the list after an existing element or at the head of the list. 47 | * Elements being removed from the head of the list should use the explicit 48 | * macro for this purpose for optimum efficiency. A singly-linked list may 49 | * only be traversed in the forward direction. Singly-linked lists are ideal 50 | * for applications with large datasets and few or no removals or for 51 | * implementing a LIFO queue. 52 | * 53 | * A list is headed by a single forward pointer (or an array of forward 54 | * pointers for a hash table header). The elements are doubly linked 55 | * so that an arbitrary element can be removed without a need to 56 | * traverse the list. New elements can be added to the list before 57 | * or after an existing element or at the head of the list. A list 58 | * may only be traversed in the forward direction. 
59 | * 60 | * A simple queue is headed by a pair of pointers, one the head of the 61 | * list and the other to the tail of the list. The elements are singly 62 | * linked to save space, so elements can only be removed from the 63 | * head of the list. New elements can be added to the list before or after 64 | * an existing element, at the head of the list, or at the end of the 65 | * list. A simple queue may only be traversed in the forward direction. 66 | * 67 | * A tail queue is headed by a pair of pointers, one to the head of the 68 | * list and the other to the tail of the list. The elements are doubly 69 | * linked so that an arbitrary element can be removed without a need to 70 | * traverse the list. New elements can be added to the list before or 71 | * after an existing element, at the head of the list, or at the end of 72 | * the list. A tail queue may be traversed in either direction. 73 | * 74 | * A circle queue is headed by a pair of pointers, one to the head of the 75 | * list and the other to the tail of the list. The elements are doubly 76 | * linked so that an arbitrary element can be removed without a need to 77 | * traverse the list. New elements can be added to the list before or after 78 | * an existing element, at the head of the list, or at the end of the list. 79 | * A circle queue may be traversed in either direction, but has a more 80 | * complex end of list detection. 81 | * 82 | * For details on the use of these macros, see the queue(3) manual page. 83 | */ 84 | 85 | #if defined(QUEUE_MACRO_DEBUG) || (defined(_KERNEL) && defined(DIAGNOSTIC)) 86 | #define _Q_INVALIDATE(a) (a) = ((void *)-1) 87 | #else 88 | #define _Q_INVALIDATE(a) 89 | #endif 90 | 91 | /* 92 | * Singly-linked List definitions. 
93 | */ 94 | #define SLIST_HEAD(name, type) \ 95 | struct name { \ 96 | struct type *slh_first; /* first element */ \ 97 | } 98 | 99 | #define SLIST_HEAD_INITIALIZER(head) \ 100 | { NULL } 101 | 102 | #define SLIST_ENTRY(type) \ 103 | struct { \ 104 | struct type *sle_next; /* next element */ \ 105 | } 106 | 107 | /* 108 | * Singly-linked List access methods. 109 | */ 110 | #define SLIST_FIRST(head) ((head)->slh_first) 111 | #define SLIST_END(head) NULL 112 | #define SLIST_EMPTY(head) (SLIST_FIRST(head) == SLIST_END(head)) 113 | #define SLIST_NEXT(elm, field) ((elm)->field.sle_next) 114 | 115 | #define SLIST_FOREACH(var, head, field) \ 116 | for((var) = SLIST_FIRST(head); \ 117 | (var) != SLIST_END(head); \ 118 | (var) = SLIST_NEXT(var, field)) 119 | 120 | #define SLIST_FOREACH_PREVPTR(var, varp, head, field) \ 121 | for ((varp) = &SLIST_FIRST((head)); \ 122 | ((var) = *(varp)) != SLIST_END(head); \ 123 | (varp) = &SLIST_NEXT((var), field)) 124 | 125 | /* 126 | * Singly-linked List functions. 
127 | */ 128 | #define SLIST_INIT(head) { \ 129 | SLIST_FIRST(head) = SLIST_END(head); \ 130 | } 131 | 132 | #define SLIST_INSERT_AFTER(slistelm, elm, field) do { \ 133 | (elm)->field.sle_next = (slistelm)->field.sle_next; \ 134 | (slistelm)->field.sle_next = (elm); \ 135 | } while (0) 136 | 137 | #define SLIST_INSERT_HEAD(head, elm, field) do { \ 138 | (elm)->field.sle_next = (head)->slh_first; \ 139 | (head)->slh_first = (elm); \ 140 | } while (0) 141 | 142 | #define SLIST_REMOVE_NEXT(head, elm, field) do { \ 143 | (elm)->field.sle_next = (elm)->field.sle_next->field.sle_next; \ 144 | } while (0) 145 | 146 | #define SLIST_REMOVE_HEAD(head, field) do { \ 147 | (head)->slh_first = (head)->slh_first->field.sle_next; \ 148 | } while (0) 149 | 150 | #define SLIST_REMOVE(head, elm, type, field) do { \ 151 | if ((head)->slh_first == (elm)) { \ 152 | SLIST_REMOVE_HEAD((head), field); \ 153 | } else { \ 154 | struct type *curelm = (head)->slh_first; \ 155 | \ 156 | while (curelm->field.sle_next != (elm)) \ 157 | curelm = curelm->field.sle_next; \ 158 | curelm->field.sle_next = \ 159 | curelm->field.sle_next->field.sle_next; \ 160 | _Q_INVALIDATE((elm)->field.sle_next); \ 161 | } \ 162 | } while (0) 163 | 164 | /* 165 | * List definitions. 
166 | */ 167 | #define LIST_HEAD(name, type) \ 168 | struct name { \ 169 | struct type *lh_first; /* first element */ \ 170 | } 171 | 172 | #define LIST_HEAD_INITIALIZER(head) \ 173 | { NULL } 174 | 175 | #define LIST_ENTRY(type) \ 176 | struct { \ 177 | struct type *le_next; /* next element */ \ 178 | struct type **le_prev; /* address of previous next element */ \ 179 | } 180 | 181 | /* 182 | * List access methods 183 | */ 184 | #define LIST_FIRST(head) ((head)->lh_first) 185 | #define LIST_END(head) NULL 186 | #define LIST_EMPTY(head) (LIST_FIRST(head) == LIST_END(head)) 187 | #define LIST_NEXT(elm, field) ((elm)->field.le_next) 188 | 189 | #define LIST_FOREACH(var, head, field) \ 190 | for((var) = LIST_FIRST(head); \ 191 | (var)!= LIST_END(head); \ 192 | (var) = LIST_NEXT(var, field)) 193 | 194 | /* 195 | * List functions. 196 | */ 197 | #define LIST_INIT(head) do { \ 198 | LIST_FIRST(head) = LIST_END(head); \ 199 | } while (0) 200 | 201 | #define LIST_INSERT_AFTER(listelm, elm, field) do { \ 202 | if (((elm)->field.le_next = (listelm)->field.le_next) != NULL) \ 203 | (listelm)->field.le_next->field.le_prev = \ 204 | &(elm)->field.le_next; \ 205 | (listelm)->field.le_next = (elm); \ 206 | (elm)->field.le_prev = &(listelm)->field.le_next; \ 207 | } while (0) 208 | 209 | #define LIST_INSERT_BEFORE(listelm, elm, field) do { \ 210 | (elm)->field.le_prev = (listelm)->field.le_prev; \ 211 | (elm)->field.le_next = (listelm); \ 212 | *(listelm)->field.le_prev = (elm); \ 213 | (listelm)->field.le_prev = &(elm)->field.le_next; \ 214 | } while (0) 215 | 216 | #define LIST_INSERT_HEAD(head, elm, field) do { \ 217 | if (((elm)->field.le_next = (head)->lh_first) != NULL) \ 218 | (head)->lh_first->field.le_prev = &(elm)->field.le_next;\ 219 | (head)->lh_first = (elm); \ 220 | (elm)->field.le_prev = &(head)->lh_first; \ 221 | } while (0) 222 | 223 | #define LIST_REMOVE(elm, field) do { \ 224 | if ((elm)->field.le_next != NULL) \ 225 | (elm)->field.le_next->field.le_prev = \ 226 
| (elm)->field.le_prev; \ 227 | *(elm)->field.le_prev = (elm)->field.le_next; \ 228 | _Q_INVALIDATE((elm)->field.le_prev); \ 229 | _Q_INVALIDATE((elm)->field.le_next); \ 230 | } while (0) 231 | 232 | #define LIST_REPLACE(elm, elm2, field) do { \ 233 | if (((elm2)->field.le_next = (elm)->field.le_next) != NULL) \ 234 | (elm2)->field.le_next->field.le_prev = \ 235 | &(elm2)->field.le_next; \ 236 | (elm2)->field.le_prev = (elm)->field.le_prev; \ 237 | *(elm2)->field.le_prev = (elm2); \ 238 | _Q_INVALIDATE((elm)->field.le_prev); \ 239 | _Q_INVALIDATE((elm)->field.le_next); \ 240 | } while (0) 241 | 242 | /* 243 | * Simple queue definitions. 244 | */ 245 | #define SIMPLEQ_HEAD(name, type) \ 246 | struct name { \ 247 | struct type *sqh_first; /* first element */ \ 248 | struct type **sqh_last; /* addr of last next element */ \ 249 | } 250 | 251 | #define SIMPLEQ_HEAD_INITIALIZER(head) \ 252 | { NULL, &(head).sqh_first } 253 | 254 | #define SIMPLEQ_ENTRY(type) \ 255 | struct { \ 256 | struct type *sqe_next; /* next element */ \ 257 | } 258 | 259 | /* 260 | * Simple queue access methods. 261 | */ 262 | #define SIMPLEQ_FIRST(head) ((head)->sqh_first) 263 | #define SIMPLEQ_END(head) NULL 264 | #define SIMPLEQ_EMPTY(head) (SIMPLEQ_FIRST(head) == SIMPLEQ_END(head)) 265 | #define SIMPLEQ_NEXT(elm, field) ((elm)->field.sqe_next) 266 | 267 | #define SIMPLEQ_FOREACH(var, head, field) \ 268 | for((var) = SIMPLEQ_FIRST(head); \ 269 | (var) != SIMPLEQ_END(head); \ 270 | (var) = SIMPLEQ_NEXT(var, field)) 271 | 272 | /* 273 | * Simple queue functions. 
274 | */ 275 | #define SIMPLEQ_INIT(head) do { \ 276 | (head)->sqh_first = NULL; \ 277 | (head)->sqh_last = &(head)->sqh_first; \ 278 | } while (0) 279 | 280 | #define SIMPLEQ_INSERT_HEAD(head, elm, field) do { \ 281 | if (((elm)->field.sqe_next = (head)->sqh_first) == NULL) \ 282 | (head)->sqh_last = &(elm)->field.sqe_next; \ 283 | (head)->sqh_first = (elm); \ 284 | } while (0) 285 | 286 | #define SIMPLEQ_INSERT_TAIL(head, elm, field) do { \ 287 | (elm)->field.sqe_next = NULL; \ 288 | *(head)->sqh_last = (elm); \ 289 | (head)->sqh_last = &(elm)->field.sqe_next; \ 290 | } while (0) 291 | 292 | #define SIMPLEQ_INSERT_AFTER(head, listelm, elm, field) do { \ 293 | if (((elm)->field.sqe_next = (listelm)->field.sqe_next) == NULL)\ 294 | (head)->sqh_last = &(elm)->field.sqe_next; \ 295 | (listelm)->field.sqe_next = (elm); \ 296 | } while (0) 297 | 298 | #define SIMPLEQ_REMOVE_HEAD(head, field) do { \ 299 | if (((head)->sqh_first = (head)->sqh_first->field.sqe_next) == NULL) \ 300 | (head)->sqh_last = &(head)->sqh_first; \ 301 | } while (0) 302 | 303 | /* 304 | * Tail queue definitions. 
305 | */ 306 | #define TAILQ_HEAD(name, type) \ 307 | struct name { \ 308 | struct type *tqh_first; /* first element */ \ 309 | struct type **tqh_last; /* addr of last next element */ \ 310 | } 311 | 312 | #define TAILQ_HEAD_INITIALIZER(head) \ 313 | { NULL, &(head).tqh_first } 314 | 315 | #define TAILQ_ENTRY(type) \ 316 | struct { \ 317 | struct type *tqe_next; /* next element */ \ 318 | struct type **tqe_prev; /* address of previous next element */ \ 319 | } 320 | 321 | /* 322 | * tail queue access methods 323 | */ 324 | #define TAILQ_FIRST(head) ((head)->tqh_first) 325 | #define TAILQ_END(head) NULL 326 | #define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) 327 | #define TAILQ_LAST(head, headname) \ 328 | (*(((struct headname *)((head)->tqh_last))->tqh_last)) 329 | /* XXX */ 330 | #define TAILQ_PREV(elm, headname, field) \ 331 | (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) 332 | #define TAILQ_EMPTY(head) \ 333 | (TAILQ_FIRST(head) == TAILQ_END(head)) 334 | 335 | #define TAILQ_FOREACH(var, head, field) \ 336 | for((var) = TAILQ_FIRST(head); \ 337 | (var) != TAILQ_END(head); \ 338 | (var) = TAILQ_NEXT(var, field)) 339 | 340 | #define TAILQ_FOREACH_REVERSE(var, head, headname, field) \ 341 | for((var) = TAILQ_LAST(head, headname); \ 342 | (var) != TAILQ_END(head); \ 343 | (var) = TAILQ_PREV(var, headname, field)) 344 | 345 | /* 346 | * Tail queue functions. 
347 | */ 348 | #define TAILQ_INIT(head) do { \ 349 | (head)->tqh_first = NULL; \ 350 | (head)->tqh_last = &(head)->tqh_first; \ 351 | } while (0) 352 | 353 | #define TAILQ_INSERT_HEAD(head, elm, field) do { \ 354 | if (((elm)->field.tqe_next = (head)->tqh_first) != NULL) \ 355 | (head)->tqh_first->field.tqe_prev = \ 356 | &(elm)->field.tqe_next; \ 357 | else \ 358 | (head)->tqh_last = &(elm)->field.tqe_next; \ 359 | (head)->tqh_first = (elm); \ 360 | (elm)->field.tqe_prev = &(head)->tqh_first; \ 361 | } while (0) 362 | 363 | #define TAILQ_INSERT_TAIL(head, elm, field) do { \ 364 | (elm)->field.tqe_next = NULL; \ 365 | (elm)->field.tqe_prev = (head)->tqh_last; \ 366 | *(head)->tqh_last = (elm); \ 367 | (head)->tqh_last = &(elm)->field.tqe_next; \ 368 | } while (0) 369 | 370 | #define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ 371 | if (((elm)->field.tqe_next = (listelm)->field.tqe_next) != NULL)\ 372 | (elm)->field.tqe_next->field.tqe_prev = \ 373 | &(elm)->field.tqe_next; \ 374 | else \ 375 | (head)->tqh_last = &(elm)->field.tqe_next; \ 376 | (listelm)->field.tqe_next = (elm); \ 377 | (elm)->field.tqe_prev = &(listelm)->field.tqe_next; \ 378 | } while (0) 379 | 380 | #define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ 381 | (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ 382 | (elm)->field.tqe_next = (listelm); \ 383 | *(listelm)->field.tqe_prev = (elm); \ 384 | (listelm)->field.tqe_prev = &(elm)->field.tqe_next; \ 385 | } while (0) 386 | 387 | #define TAILQ_REMOVE(head, elm, field) do { \ 388 | if (((elm)->field.tqe_next) != NULL) \ 389 | (elm)->field.tqe_next->field.tqe_prev = \ 390 | (elm)->field.tqe_prev; \ 391 | else \ 392 | (head)->tqh_last = (elm)->field.tqe_prev; \ 393 | *(elm)->field.tqe_prev = (elm)->field.tqe_next; \ 394 | _Q_INVALIDATE((elm)->field.tqe_prev); \ 395 | _Q_INVALIDATE((elm)->field.tqe_next); \ 396 | } while (0) 397 | 398 | #define TAILQ_REPLACE(head, elm, elm2, field) do { \ 399 | if (((elm2)->field.tqe_next = 
(elm)->field.tqe_next) != NULL) \ 400 | (elm2)->field.tqe_next->field.tqe_prev = \ 401 | &(elm2)->field.tqe_next; \ 402 | else \ 403 | (head)->tqh_last = &(elm2)->field.tqe_next; \ 404 | (elm2)->field.tqe_prev = (elm)->field.tqe_prev; \ 405 | *(elm2)->field.tqe_prev = (elm2); \ 406 | _Q_INVALIDATE((elm)->field.tqe_prev); \ 407 | _Q_INVALIDATE((elm)->field.tqe_next); \ 408 | } while (0) 409 | 410 | /* 411 | * Circular queue definitions. 412 | */ 413 | #define CIRCLEQ_HEAD(name, type) \ 414 | struct name { \ 415 | struct type *cqh_first; /* first element */ \ 416 | struct type *cqh_last; /* last element */ \ 417 | } 418 | 419 | #define CIRCLEQ_HEAD_INITIALIZER(head) \ 420 | { CIRCLEQ_END(&head), CIRCLEQ_END(&head) } 421 | 422 | #define CIRCLEQ_ENTRY(type) \ 423 | struct { \ 424 | struct type *cqe_next; /* next element */ \ 425 | struct type *cqe_prev; /* previous element */ \ 426 | } 427 | 428 | /* 429 | * Circular queue access methods 430 | */ 431 | #define CIRCLEQ_FIRST(head) ((head)->cqh_first) 432 | #define CIRCLEQ_LAST(head) ((head)->cqh_last) 433 | #define CIRCLEQ_END(head) ((void *)(head)) 434 | #define CIRCLEQ_NEXT(elm, field) ((elm)->field.cqe_next) 435 | #define CIRCLEQ_PREV(elm, field) ((elm)->field.cqe_prev) 436 | #define CIRCLEQ_EMPTY(head) \ 437 | (CIRCLEQ_FIRST(head) == CIRCLEQ_END(head)) 438 | 439 | #define CIRCLEQ_FOREACH(var, head, field) \ 440 | for((var) = CIRCLEQ_FIRST(head); \ 441 | (var) != CIRCLEQ_END(head); \ 442 | (var) = CIRCLEQ_NEXT(var, field)) 443 | 444 | #define CIRCLEQ_FOREACH_REVERSE(var, head, field) \ 445 | for((var) = CIRCLEQ_LAST(head); \ 446 | (var) != CIRCLEQ_END(head); \ 447 | (var) = CIRCLEQ_PREV(var, field)) 448 | 449 | /* 450 | * Circular queue functions. 
/*
 * Circular queue access methods.  All macros take a POINTER to the queue
 * head; an empty queue has both cqh_first and cqh_last == CIRCLEQ_END(head).
 */
#define CIRCLEQ_INIT(head) do {						\
	(head)->cqh_first = CIRCLEQ_END(head);				\
	(head)->cqh_last = CIRCLEQ_END(head);				\
} while (0)

#define CIRCLEQ_INSERT_AFTER(head, listelm, elm, field) do {		\
	(elm)->field.cqe_next = (listelm)->field.cqe_next;		\
	(elm)->field.cqe_prev = (listelm);				\
	if ((listelm)->field.cqe_next == CIRCLEQ_END(head))		\
		(head)->cqh_last = (elm);				\
	else								\
		(listelm)->field.cqe_next->field.cqe_prev = (elm);	\
	(listelm)->field.cqe_next = (elm);				\
} while (0)

#define CIRCLEQ_INSERT_BEFORE(head, listelm, elm, field) do {		\
	(elm)->field.cqe_next = (listelm);				\
	(elm)->field.cqe_prev = (listelm)->field.cqe_prev;		\
	if ((listelm)->field.cqe_prev == CIRCLEQ_END(head))		\
		(head)->cqh_first = (elm);				\
	else								\
		(listelm)->field.cqe_prev->field.cqe_next = (elm);	\
	(listelm)->field.cqe_prev = (elm);				\
} while (0)

#define CIRCLEQ_INSERT_HEAD(head, elm, field) do {			\
	(elm)->field.cqe_next = (head)->cqh_first;			\
	(elm)->field.cqe_prev = CIRCLEQ_END(head);			\
	if ((head)->cqh_last == CIRCLEQ_END(head))			\
		(head)->cqh_last = (elm);				\
	else								\
		(head)->cqh_first->field.cqe_prev = (elm);		\
	(head)->cqh_first = (elm);					\
} while (0)

#define CIRCLEQ_INSERT_TAIL(head, elm, field) do {			\
	(elm)->field.cqe_next = CIRCLEQ_END(head);			\
	(elm)->field.cqe_prev = (head)->cqh_last;			\
	if ((head)->cqh_first == CIRCLEQ_END(head))			\
		(head)->cqh_first = (elm);				\
	else								\
		(head)->cqh_last->field.cqe_next = (elm);		\
	(head)->cqh_last = (elm);					\
} while (0)

#define CIRCLEQ_REMOVE(head, elm, field) do {				\
	if ((elm)->field.cqe_next == CIRCLEQ_END(head))			\
		(head)->cqh_last = (elm)->field.cqe_prev;		\
	else								\
		(elm)->field.cqe_next->field.cqe_prev =			\
		    (elm)->field.cqe_prev;				\
	if ((elm)->field.cqe_prev == CIRCLEQ_END(head))			\
		(head)->cqh_first = (elm)->field.cqe_next;		\
	else								\
		(elm)->field.cqe_prev->field.cqe_next =			\
		    (elm)->field.cqe_next;				\
	_Q_INVALIDATE((elm)->field.cqe_prev);				\
	_Q_INVALIDATE((elm)->field.cqe_next);				\
} while (0)

/*
 * Replace elm by elm2 in place.  Fixed: head is a pointer here, exactly as
 * in every other CIRCLEQ macro, so the head fields must be reached with
 * "->", not "." -- the "." form did not even compile with a pointer head.
 */
#define CIRCLEQ_REPLACE(head, elm, elm2, field) do {			\
	if (((elm2)->field.cqe_next = (elm)->field.cqe_next) ==		\
	    CIRCLEQ_END(head))						\
		(head)->cqh_last = (elm2);				\
	else								\
		(elm2)->field.cqe_next->field.cqe_prev = (elm2);	\
	if (((elm2)->field.cqe_prev = (elm)->field.cqe_prev) ==		\
	    CIRCLEQ_END(head))						\
		(head)->cqh_first = (elm2);				\
	else								\
		(elm2)->field.cqe_prev->field.cqe_next = (elm2);	\
	_Q_INVALIDATE((elm)->field.cqe_prev);				\
	_Q_INVALIDATE((elm)->field.cqe_next);				\
} while (0)
18 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | */ 26 | 27 | #ifndef _SYS_TREE_H_ 28 | #define _SYS_TREE_H_ 29 | 30 | /* 31 | * This file defines data structures for different types of trees: 32 | * splay trees and red-black trees. 33 | * 34 | * A splay tree is a self-organizing data structure. Every operation 35 | * on the tree causes a splay to happen. The splay moves the requested 36 | * node to the root of the tree and partly rebalances it. 37 | * 38 | * This has the benefit that request locality causes faster lookups as 39 | * the requested nodes move to the top of the tree. On the other hand, 40 | * every lookup causes memory writes. 41 | * 42 | * The Balance Theorem bounds the total access time for m operations 43 | * and n inserts on an initially empty tree as O((m + n)lg n). The 44 | * amortized cost for a sequence of m accesses to a splay tree is O(lg n); 45 | * 46 | * A red-black tree is a binary search tree with the node color as an 47 | * extra attribute. It fulfills a set of conditions: 48 | * - every search path from the root to a leaf consists of the 49 | * same number of black nodes, 50 | * - each red node (except for the root) has a black parent, 51 | * - each leaf node is black. 52 | * 53 | * Every operation on a red-black tree is bounded as O(lg n). 54 | * The maximum height of a red-black tree is 2lg (n+1). 
55 | */ 56 | 57 | #define SPLAY_HEAD(name, type) \ 58 | struct name { \ 59 | struct type *sph_root; /* root of the tree */ \ 60 | } 61 | 62 | #define SPLAY_INITIALIZER(root) \ 63 | { NULL } 64 | 65 | #define SPLAY_INIT(root) do { \ 66 | (root)->sph_root = NULL; \ 67 | } while (0) 68 | 69 | #define SPLAY_ENTRY(type) \ 70 | struct { \ 71 | struct type *spe_left; /* left element */ \ 72 | struct type *spe_right; /* right element */ \ 73 | } 74 | 75 | #define SPLAY_LEFT(elm, field) (elm)->field.spe_left 76 | #define SPLAY_RIGHT(elm, field) (elm)->field.spe_right 77 | #define SPLAY_ROOT(head) (head)->sph_root 78 | #define SPLAY_EMPTY(head) (SPLAY_ROOT(head) == NULL) 79 | 80 | /* SPLAY_ROTATE_{LEFT,RIGHT} expect that tmp hold SPLAY_{RIGHT,LEFT} */ 81 | #define SPLAY_ROTATE_RIGHT(head, tmp, field) do { \ 82 | SPLAY_LEFT((head)->sph_root, field) = SPLAY_RIGHT(tmp, field); \ 83 | SPLAY_RIGHT(tmp, field) = (head)->sph_root; \ 84 | (head)->sph_root = tmp; \ 85 | } while (0) 86 | 87 | #define SPLAY_ROTATE_LEFT(head, tmp, field) do { \ 88 | SPLAY_RIGHT((head)->sph_root, field) = SPLAY_LEFT(tmp, field); \ 89 | SPLAY_LEFT(tmp, field) = (head)->sph_root; \ 90 | (head)->sph_root = tmp; \ 91 | } while (0) 92 | 93 | #define SPLAY_LINKLEFT(head, tmp, field) do { \ 94 | SPLAY_LEFT(tmp, field) = (head)->sph_root; \ 95 | tmp = (head)->sph_root; \ 96 | (head)->sph_root = SPLAY_LEFT((head)->sph_root, field); \ 97 | } while (0) 98 | 99 | #define SPLAY_LINKRIGHT(head, tmp, field) do { \ 100 | SPLAY_RIGHT(tmp, field) = (head)->sph_root; \ 101 | tmp = (head)->sph_root; \ 102 | (head)->sph_root = SPLAY_RIGHT((head)->sph_root, field); \ 103 | } while (0) 104 | 105 | #define SPLAY_ASSEMBLE(head, node, left, right, field) do { \ 106 | SPLAY_RIGHT(left, field) = SPLAY_LEFT((head)->sph_root, field); \ 107 | SPLAY_LEFT(right, field) = SPLAY_RIGHT((head)->sph_root, field);\ 108 | SPLAY_LEFT((head)->sph_root, field) = SPLAY_RIGHT(node, field); \ 109 | SPLAY_RIGHT((head)->sph_root, field) = 
SPLAY_LEFT(node, field); \ 110 | } while (0) 111 | 112 | /* Generates prototypes and inline functions */ 113 | 114 | #define SPLAY_PROTOTYPE(name, type, field, cmp) \ 115 | void name##_SPLAY(struct name *, struct type *); \ 116 | void name##_SPLAY_MINMAX(struct name *, int); \ 117 | struct type *name##_SPLAY_INSERT(struct name *, struct type *); \ 118 | struct type *name##_SPLAY_REMOVE(struct name *, struct type *); \ 119 | \ 120 | /* Finds the node with the same key as elm */ \ 121 | static __inline struct type * \ 122 | name##_SPLAY_FIND(struct name *head, struct type *elm) \ 123 | { \ 124 | if (SPLAY_EMPTY(head)) \ 125 | return(NULL); \ 126 | name##_SPLAY(head, elm); \ 127 | if ((cmp)(elm, (head)->sph_root) == 0) \ 128 | return (head->sph_root); \ 129 | return (NULL); \ 130 | } \ 131 | \ 132 | static __inline struct type * \ 133 | name##_SPLAY_NEXT(struct name *head, struct type *elm) \ 134 | { \ 135 | name##_SPLAY(head, elm); \ 136 | if (SPLAY_RIGHT(elm, field) != NULL) { \ 137 | elm = SPLAY_RIGHT(elm, field); \ 138 | while (SPLAY_LEFT(elm, field) != NULL) { \ 139 | elm = SPLAY_LEFT(elm, field); \ 140 | } \ 141 | } else \ 142 | elm = NULL; \ 143 | return (elm); \ 144 | } \ 145 | \ 146 | static __inline struct type * \ 147 | name##_SPLAY_MIN_MAX(struct name *head, int val) \ 148 | { \ 149 | name##_SPLAY_MINMAX(head, val); \ 150 | return (SPLAY_ROOT(head)); \ 151 | } 152 | 153 | /* Main splay operation. 
154 | * Moves node close to the key of elm to top 155 | */ 156 | #define SPLAY_GENERATE(name, type, field, cmp) \ 157 | struct type * \ 158 | name##_SPLAY_INSERT(struct name *head, struct type *elm) \ 159 | { \ 160 | if (SPLAY_EMPTY(head)) { \ 161 | SPLAY_LEFT(elm, field) = SPLAY_RIGHT(elm, field) = NULL; \ 162 | } else { \ 163 | int __comp; \ 164 | name##_SPLAY(head, elm); \ 165 | __comp = (cmp)(elm, (head)->sph_root); \ 166 | if(__comp < 0) { \ 167 | SPLAY_LEFT(elm, field) = SPLAY_LEFT((head)->sph_root, field);\ 168 | SPLAY_RIGHT(elm, field) = (head)->sph_root; \ 169 | SPLAY_LEFT((head)->sph_root, field) = NULL; \ 170 | } else if (__comp > 0) { \ 171 | SPLAY_RIGHT(elm, field) = SPLAY_RIGHT((head)->sph_root, field);\ 172 | SPLAY_LEFT(elm, field) = (head)->sph_root; \ 173 | SPLAY_RIGHT((head)->sph_root, field) = NULL; \ 174 | } else \ 175 | return ((head)->sph_root); \ 176 | } \ 177 | (head)->sph_root = (elm); \ 178 | return (NULL); \ 179 | } \ 180 | \ 181 | struct type * \ 182 | name##_SPLAY_REMOVE(struct name *head, struct type *elm) \ 183 | { \ 184 | struct type *__tmp; \ 185 | if (SPLAY_EMPTY(head)) \ 186 | return (NULL); \ 187 | name##_SPLAY(head, elm); \ 188 | if ((cmp)(elm, (head)->sph_root) == 0) { \ 189 | if (SPLAY_LEFT((head)->sph_root, field) == NULL) { \ 190 | (head)->sph_root = SPLAY_RIGHT((head)->sph_root, field);\ 191 | } else { \ 192 | __tmp = SPLAY_RIGHT((head)->sph_root, field); \ 193 | (head)->sph_root = SPLAY_LEFT((head)->sph_root, field);\ 194 | name##_SPLAY(head, elm); \ 195 | SPLAY_RIGHT((head)->sph_root, field) = __tmp; \ 196 | } \ 197 | return (elm); \ 198 | } \ 199 | return (NULL); \ 200 | } \ 201 | \ 202 | void \ 203 | name##_SPLAY(struct name *head, struct type *elm) \ 204 | { \ 205 | struct type __node, *__left, *__right, *__tmp; \ 206 | int __comp; \ 207 | \ 208 | SPLAY_LEFT(&__node, field) = SPLAY_RIGHT(&__node, field) = NULL;\ 209 | __left = __right = &__node; \ 210 | \ 211 | while ((__comp = (cmp)(elm, (head)->sph_root))) { \ 212 | 
if (__comp < 0) { \ 213 | __tmp = SPLAY_LEFT((head)->sph_root, field); \ 214 | if (__tmp == NULL) \ 215 | break; \ 216 | if ((cmp)(elm, __tmp) < 0){ \ 217 | SPLAY_ROTATE_RIGHT(head, __tmp, field); \ 218 | if (SPLAY_LEFT((head)->sph_root, field) == NULL)\ 219 | break; \ 220 | } \ 221 | SPLAY_LINKLEFT(head, __right, field); \ 222 | } else if (__comp > 0) { \ 223 | __tmp = SPLAY_RIGHT((head)->sph_root, field); \ 224 | if (__tmp == NULL) \ 225 | break; \ 226 | if ((cmp)(elm, __tmp) > 0){ \ 227 | SPLAY_ROTATE_LEFT(head, __tmp, field); \ 228 | if (SPLAY_RIGHT((head)->sph_root, field) == NULL)\ 229 | break; \ 230 | } \ 231 | SPLAY_LINKRIGHT(head, __left, field); \ 232 | } \ 233 | } \ 234 | SPLAY_ASSEMBLE(head, &__node, __left, __right, field); \ 235 | } \ 236 | \ 237 | /* Splay with either the minimum or the maximum element \ 238 | * Used to find minimum or maximum element in tree. \ 239 | */ \ 240 | void name##_SPLAY_MINMAX(struct name *head, int __comp) \ 241 | { \ 242 | struct type __node, *__left, *__right, *__tmp; \ 243 | \ 244 | SPLAY_LEFT(&__node, field) = SPLAY_RIGHT(&__node, field) = NULL;\ 245 | __left = __right = &__node; \ 246 | \ 247 | while (1) { \ 248 | if (__comp < 0) { \ 249 | __tmp = SPLAY_LEFT((head)->sph_root, field); \ 250 | if (__tmp == NULL) \ 251 | break; \ 252 | if (__comp < 0){ \ 253 | SPLAY_ROTATE_RIGHT(head, __tmp, field); \ 254 | if (SPLAY_LEFT((head)->sph_root, field) == NULL)\ 255 | break; \ 256 | } \ 257 | SPLAY_LINKLEFT(head, __right, field); \ 258 | } else if (__comp > 0) { \ 259 | __tmp = SPLAY_RIGHT((head)->sph_root, field); \ 260 | if (__tmp == NULL) \ 261 | break; \ 262 | if (__comp > 0) { \ 263 | SPLAY_ROTATE_LEFT(head, __tmp, field); \ 264 | if (SPLAY_RIGHT((head)->sph_root, field) == NULL)\ 265 | break; \ 266 | } \ 267 | SPLAY_LINKRIGHT(head, __left, field); \ 268 | } \ 269 | } \ 270 | SPLAY_ASSEMBLE(head, &__node, __left, __right, field); \ 271 | } 272 | 273 | #define SPLAY_NEGINF -1 274 | #define SPLAY_INF 1 275 | 276 | #define 
SPLAY_INSERT(name, x, y) name##_SPLAY_INSERT(x, y) 277 | #define SPLAY_REMOVE(name, x, y) name##_SPLAY_REMOVE(x, y) 278 | #define SPLAY_FIND(name, x, y) name##_SPLAY_FIND(x, y) 279 | #define SPLAY_NEXT(name, x, y) name##_SPLAY_NEXT(x, y) 280 | #define SPLAY_MIN(name, x) (SPLAY_EMPTY(x) ? NULL \ 281 | : name##_SPLAY_MIN_MAX(x, SPLAY_NEGINF)) 282 | #define SPLAY_MAX(name, x) (SPLAY_EMPTY(x) ? NULL \ 283 | : name##_SPLAY_MIN_MAX(x, SPLAY_INF)) 284 | 285 | #define SPLAY_FOREACH(x, name, head) \ 286 | for ((x) = SPLAY_MIN(name, head); \ 287 | (x) != NULL; \ 288 | (x) = SPLAY_NEXT(name, head, x)) 289 | 290 | /* Macros that define a red-black tree */ 291 | #define RB_HEAD(name, type) \ 292 | struct name { \ 293 | struct type *rbh_root; /* root of the tree */ \ 294 | } 295 | 296 | #define RB_INITIALIZER(root) \ 297 | { NULL } 298 | 299 | #define RB_INIT(root) do { \ 300 | (root)->rbh_root = NULL; \ 301 | } while (0) 302 | 303 | #define RB_BLACK 0 304 | #define RB_RED 1 305 | #define RB_ENTRY(type) \ 306 | struct { \ 307 | struct type *rbe_left; /* left element */ \ 308 | struct type *rbe_right; /* right element */ \ 309 | struct type *rbe_parent; /* parent element */ \ 310 | int rbe_color; /* node color */ \ 311 | } 312 | 313 | #define RB_LEFT(elm, field) (elm)->field.rbe_left 314 | #define RB_RIGHT(elm, field) (elm)->field.rbe_right 315 | #define RB_PARENT(elm, field) (elm)->field.rbe_parent 316 | #define RB_COLOR(elm, field) (elm)->field.rbe_color 317 | #define RB_ROOT(head) (head)->rbh_root 318 | #define RB_EMPTY(head) (RB_ROOT(head) == NULL) 319 | 320 | #define RB_SET(elm, parent, field) do { \ 321 | RB_PARENT(elm, field) = parent; \ 322 | RB_LEFT(elm, field) = RB_RIGHT(elm, field) = NULL; \ 323 | RB_COLOR(elm, field) = RB_RED; \ 324 | } while (0) 325 | 326 | #define RB_SET_BLACKRED(black, red, field) do { \ 327 | RB_COLOR(black, field) = RB_BLACK; \ 328 | RB_COLOR(red, field) = RB_RED; \ 329 | } while (0) 330 | 331 | #ifndef RB_AUGMENT 332 | #define RB_AUGMENT(x) do 
{} while (0) 333 | #endif 334 | 335 | #define RB_ROTATE_LEFT(head, elm, tmp, field) do { \ 336 | (tmp) = RB_RIGHT(elm, field); \ 337 | if ((RB_RIGHT(elm, field) = RB_LEFT(tmp, field))) { \ 338 | RB_PARENT(RB_LEFT(tmp, field), field) = (elm); \ 339 | } \ 340 | RB_AUGMENT(elm); \ 341 | if ((RB_PARENT(tmp, field) = RB_PARENT(elm, field))) { \ 342 | if ((elm) == RB_LEFT(RB_PARENT(elm, field), field)) \ 343 | RB_LEFT(RB_PARENT(elm, field), field) = (tmp); \ 344 | else \ 345 | RB_RIGHT(RB_PARENT(elm, field), field) = (tmp); \ 346 | } else \ 347 | (head)->rbh_root = (tmp); \ 348 | RB_LEFT(tmp, field) = (elm); \ 349 | RB_PARENT(elm, field) = (tmp); \ 350 | RB_AUGMENT(tmp); \ 351 | if ((RB_PARENT(tmp, field))) \ 352 | RB_AUGMENT(RB_PARENT(tmp, field)); \ 353 | } while (0) 354 | 355 | #define RB_ROTATE_RIGHT(head, elm, tmp, field) do { \ 356 | (tmp) = RB_LEFT(elm, field); \ 357 | if ((RB_LEFT(elm, field) = RB_RIGHT(tmp, field))) { \ 358 | RB_PARENT(RB_RIGHT(tmp, field), field) = (elm); \ 359 | } \ 360 | RB_AUGMENT(elm); \ 361 | if ((RB_PARENT(tmp, field) = RB_PARENT(elm, field))) { \ 362 | if ((elm) == RB_LEFT(RB_PARENT(elm, field), field)) \ 363 | RB_LEFT(RB_PARENT(elm, field), field) = (tmp); \ 364 | else \ 365 | RB_RIGHT(RB_PARENT(elm, field), field) = (tmp); \ 366 | } else \ 367 | (head)->rbh_root = (tmp); \ 368 | RB_RIGHT(tmp, field) = (elm); \ 369 | RB_PARENT(elm, field) = (tmp); \ 370 | RB_AUGMENT(tmp); \ 371 | if ((RB_PARENT(tmp, field))) \ 372 | RB_AUGMENT(RB_PARENT(tmp, field)); \ 373 | } while (0) 374 | 375 | /* Generates prototypes and inline functions */ 376 | #define RB_PROTOTYPE(name, type, field, cmp) \ 377 | RB_PROTOTYPE_INTERNAL(name, type, field, cmp,) 378 | #define RB_PROTOTYPE_STATIC(name, type, field, cmp) \ 379 | RB_PROTOTYPE_INTERNAL(name, type, field, cmp, __attribute__((__unused__)) static) 380 | #define RB_PROTOTYPE_INTERNAL(name, type, field, cmp, attr) \ 381 | attr void name##_RB_INSERT_COLOR(struct name *, struct type *); \ 382 | attr void 
name##_RB_REMOVE_COLOR(struct name *, struct type *, struct type *);\ 383 | attr struct type *name##_RB_REMOVE(struct name *, struct type *); \ 384 | attr struct type *name##_RB_INSERT(struct name *, struct type *); \ 385 | attr struct type *name##_RB_FIND(struct name *, struct type *); \ 386 | attr struct type *name##_RB_NFIND(struct name *, struct type *); \ 387 | attr struct type *name##_RB_NEXT(struct type *); \ 388 | attr struct type *name##_RB_PREV(struct type *); \ 389 | attr struct type *name##_RB_MINMAX(struct name *, int); \ 390 | \ 391 | 392 | /* Main rb operation. 393 | * Moves node close to the key of elm to top 394 | */ 395 | #define RB_GENERATE(name, type, field, cmp) \ 396 | RB_GENERATE_INTERNAL(name, type, field, cmp,) 397 | #define RB_GENERATE_STATIC(name, type, field, cmp) \ 398 | RB_GENERATE_INTERNAL(name, type, field, cmp, __attribute__((__unused__)) static) 399 | #define RB_GENERATE_INTERNAL(name, type, field, cmp, attr) \ 400 | attr void \ 401 | name##_RB_INSERT_COLOR(struct name *head, struct type *elm) \ 402 | { \ 403 | struct type *parent, *gparent, *tmp; \ 404 | while ((parent = RB_PARENT(elm, field)) && \ 405 | RB_COLOR(parent, field) == RB_RED) { \ 406 | gparent = RB_PARENT(parent, field); \ 407 | if (parent == RB_LEFT(gparent, field)) { \ 408 | tmp = RB_RIGHT(gparent, field); \ 409 | if (tmp && RB_COLOR(tmp, field) == RB_RED) { \ 410 | RB_COLOR(tmp, field) = RB_BLACK; \ 411 | RB_SET_BLACKRED(parent, gparent, field);\ 412 | elm = gparent; \ 413 | continue; \ 414 | } \ 415 | if (RB_RIGHT(parent, field) == elm) { \ 416 | RB_ROTATE_LEFT(head, parent, tmp, field);\ 417 | tmp = parent; \ 418 | parent = elm; \ 419 | elm = tmp; \ 420 | } \ 421 | RB_SET_BLACKRED(parent, gparent, field); \ 422 | RB_ROTATE_RIGHT(head, gparent, tmp, field); \ 423 | } else { \ 424 | tmp = RB_LEFT(gparent, field); \ 425 | if (tmp && RB_COLOR(tmp, field) == RB_RED) { \ 426 | RB_COLOR(tmp, field) = RB_BLACK; \ 427 | RB_SET_BLACKRED(parent, gparent, field);\ 428 | elm 
= gparent; \ 429 | continue; \ 430 | } \ 431 | if (RB_LEFT(parent, field) == elm) { \ 432 | RB_ROTATE_RIGHT(head, parent, tmp, field);\ 433 | tmp = parent; \ 434 | parent = elm; \ 435 | elm = tmp; \ 436 | } \ 437 | RB_SET_BLACKRED(parent, gparent, field); \ 438 | RB_ROTATE_LEFT(head, gparent, tmp, field); \ 439 | } \ 440 | } \ 441 | RB_COLOR(head->rbh_root, field) = RB_BLACK; \ 442 | } \ 443 | \ 444 | attr void \ 445 | name##_RB_REMOVE_COLOR(struct name *head, struct type *parent, struct type *elm) \ 446 | { \ 447 | struct type *tmp; \ 448 | while ((elm == NULL || RB_COLOR(elm, field) == RB_BLACK) && \ 449 | elm != RB_ROOT(head)) { \ 450 | if (RB_LEFT(parent, field) == elm) { \ 451 | tmp = RB_RIGHT(parent, field); \ 452 | if (RB_COLOR(tmp, field) == RB_RED) { \ 453 | RB_SET_BLACKRED(tmp, parent, field); \ 454 | RB_ROTATE_LEFT(head, parent, tmp, field);\ 455 | tmp = RB_RIGHT(parent, field); \ 456 | } \ 457 | if ((RB_LEFT(tmp, field) == NULL || \ 458 | RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) &&\ 459 | (RB_RIGHT(tmp, field) == NULL || \ 460 | RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK)) {\ 461 | RB_COLOR(tmp, field) = RB_RED; \ 462 | elm = parent; \ 463 | parent = RB_PARENT(elm, field); \ 464 | } else { \ 465 | if (RB_RIGHT(tmp, field) == NULL || \ 466 | RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK) {\ 467 | struct type *oleft; \ 468 | if ((oleft = RB_LEFT(tmp, field)))\ 469 | RB_COLOR(oleft, field) = RB_BLACK;\ 470 | RB_COLOR(tmp, field) = RB_RED; \ 471 | RB_ROTATE_RIGHT(head, tmp, oleft, field);\ 472 | tmp = RB_RIGHT(parent, field); \ 473 | } \ 474 | RB_COLOR(tmp, field) = RB_COLOR(parent, field);\ 475 | RB_COLOR(parent, field) = RB_BLACK; \ 476 | if (RB_RIGHT(tmp, field)) \ 477 | RB_COLOR(RB_RIGHT(tmp, field), field) = RB_BLACK;\ 478 | RB_ROTATE_LEFT(head, parent, tmp, field);\ 479 | elm = RB_ROOT(head); \ 480 | break; \ 481 | } \ 482 | } else { \ 483 | tmp = RB_LEFT(parent, field); \ 484 | if (RB_COLOR(tmp, field) == RB_RED) { \ 485 | 
RB_SET_BLACKRED(tmp, parent, field); \ 486 | RB_ROTATE_RIGHT(head, parent, tmp, field);\ 487 | tmp = RB_LEFT(parent, field); \ 488 | } \ 489 | if ((RB_LEFT(tmp, field) == NULL || \ 490 | RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) &&\ 491 | (RB_RIGHT(tmp, field) == NULL || \ 492 | RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK)) {\ 493 | RB_COLOR(tmp, field) = RB_RED; \ 494 | elm = parent; \ 495 | parent = RB_PARENT(elm, field); \ 496 | } else { \ 497 | if (RB_LEFT(tmp, field) == NULL || \ 498 | RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) {\ 499 | struct type *oright; \ 500 | if ((oright = RB_RIGHT(tmp, field)))\ 501 | RB_COLOR(oright, field) = RB_BLACK;\ 502 | RB_COLOR(tmp, field) = RB_RED; \ 503 | RB_ROTATE_LEFT(head, tmp, oright, field);\ 504 | tmp = RB_LEFT(parent, field); \ 505 | } \ 506 | RB_COLOR(tmp, field) = RB_COLOR(parent, field);\ 507 | RB_COLOR(parent, field) = RB_BLACK; \ 508 | if (RB_LEFT(tmp, field)) \ 509 | RB_COLOR(RB_LEFT(tmp, field), field) = RB_BLACK;\ 510 | RB_ROTATE_RIGHT(head, parent, tmp, field);\ 511 | elm = RB_ROOT(head); \ 512 | break; \ 513 | } \ 514 | } \ 515 | } \ 516 | if (elm) \ 517 | RB_COLOR(elm, field) = RB_BLACK; \ 518 | } \ 519 | \ 520 | attr struct type * \ 521 | name##_RB_REMOVE(struct name *head, struct type *elm) \ 522 | { \ 523 | struct type *child, *parent, *old = elm; \ 524 | int color; \ 525 | if (RB_LEFT(elm, field) == NULL) \ 526 | child = RB_RIGHT(elm, field); \ 527 | else if (RB_RIGHT(elm, field) == NULL) \ 528 | child = RB_LEFT(elm, field); \ 529 | else { \ 530 | struct type *left; \ 531 | elm = RB_RIGHT(elm, field); \ 532 | while ((left = RB_LEFT(elm, field))) \ 533 | elm = left; \ 534 | child = RB_RIGHT(elm, field); \ 535 | parent = RB_PARENT(elm, field); \ 536 | color = RB_COLOR(elm, field); \ 537 | if (child) \ 538 | RB_PARENT(child, field) = parent; \ 539 | if (parent) { \ 540 | if (RB_LEFT(parent, field) == elm) \ 541 | RB_LEFT(parent, field) = child; \ 542 | else \ 543 | RB_RIGHT(parent, 
field) = child; \ 544 | RB_AUGMENT(parent); \ 545 | } else \ 546 | RB_ROOT(head) = child; \ 547 | if (RB_PARENT(elm, field) == old) \ 548 | parent = elm; \ 549 | (elm)->field = (old)->field; \ 550 | if (RB_PARENT(old, field)) { \ 551 | if (RB_LEFT(RB_PARENT(old, field), field) == old)\ 552 | RB_LEFT(RB_PARENT(old, field), field) = elm;\ 553 | else \ 554 | RB_RIGHT(RB_PARENT(old, field), field) = elm;\ 555 | RB_AUGMENT(RB_PARENT(old, field)); \ 556 | } else \ 557 | RB_ROOT(head) = elm; \ 558 | RB_PARENT(RB_LEFT(old, field), field) = elm; \ 559 | if (RB_RIGHT(old, field)) \ 560 | RB_PARENT(RB_RIGHT(old, field), field) = elm; \ 561 | if (parent) { \ 562 | left = parent; \ 563 | do { \ 564 | RB_AUGMENT(left); \ 565 | } while ((left = RB_PARENT(left, field))); \ 566 | } \ 567 | goto color; \ 568 | } \ 569 | parent = RB_PARENT(elm, field); \ 570 | color = RB_COLOR(elm, field); \ 571 | if (child) \ 572 | RB_PARENT(child, field) = parent; \ 573 | if (parent) { \ 574 | if (RB_LEFT(parent, field) == elm) \ 575 | RB_LEFT(parent, field) = child; \ 576 | else \ 577 | RB_RIGHT(parent, field) = child; \ 578 | RB_AUGMENT(parent); \ 579 | } else \ 580 | RB_ROOT(head) = child; \ 581 | color: \ 582 | if (color == RB_BLACK) \ 583 | name##_RB_REMOVE_COLOR(head, parent, child); \ 584 | return (old); \ 585 | } \ 586 | \ 587 | /* Inserts a node into the RB tree */ \ 588 | attr struct type * \ 589 | name##_RB_INSERT(struct name *head, struct type *elm) \ 590 | { \ 591 | struct type *tmp; \ 592 | struct type *parent = NULL; \ 593 | int comp = 0; \ 594 | tmp = RB_ROOT(head); \ 595 | while (tmp) { \ 596 | parent = tmp; \ 597 | comp = (cmp)(elm, parent); \ 598 | if (comp < 0) \ 599 | tmp = RB_LEFT(tmp, field); \ 600 | else if (comp > 0) \ 601 | tmp = RB_RIGHT(tmp, field); \ 602 | else \ 603 | return (tmp); \ 604 | } \ 605 | RB_SET(elm, parent, field); \ 606 | if (parent != NULL) { \ 607 | if (comp < 0) \ 608 | RB_LEFT(parent, field) = elm; \ 609 | else \ 610 | RB_RIGHT(parent, field) = elm; \ 
611 | RB_AUGMENT(parent); \ 612 | } else \ 613 | RB_ROOT(head) = elm; \ 614 | name##_RB_INSERT_COLOR(head, elm); \ 615 | return (NULL); \ 616 | } \ 617 | \ 618 | /* Finds the node with the same key as elm */ \ 619 | attr struct type * \ 620 | name##_RB_FIND(struct name *head, struct type *elm) \ 621 | { \ 622 | struct type *tmp = RB_ROOT(head); \ 623 | int comp; \ 624 | while (tmp) { \ 625 | comp = cmp(elm, tmp); \ 626 | if (comp < 0) \ 627 | tmp = RB_LEFT(tmp, field); \ 628 | else if (comp > 0) \ 629 | tmp = RB_RIGHT(tmp, field); \ 630 | else \ 631 | return (tmp); \ 632 | } \ 633 | return (NULL); \ 634 | } \ 635 | \ 636 | /* Finds the first node greater than or equal to the search key */ \ 637 | attr struct type * \ 638 | name##_RB_NFIND(struct name *head, struct type *elm) \ 639 | { \ 640 | struct type *tmp = RB_ROOT(head); \ 641 | struct type *res = NULL; \ 642 | int comp; \ 643 | while (tmp) { \ 644 | comp = cmp(elm, tmp); \ 645 | if (comp < 0) { \ 646 | res = tmp; \ 647 | tmp = RB_LEFT(tmp, field); \ 648 | } \ 649 | else if (comp > 0) \ 650 | tmp = RB_RIGHT(tmp, field); \ 651 | else \ 652 | return (tmp); \ 653 | } \ 654 | return (res); \ 655 | } \ 656 | \ 657 | /* ARGSUSED */ \ 658 | attr struct type * \ 659 | name##_RB_NEXT(struct type *elm) \ 660 | { \ 661 | if (RB_RIGHT(elm, field)) { \ 662 | elm = RB_RIGHT(elm, field); \ 663 | while (RB_LEFT(elm, field)) \ 664 | elm = RB_LEFT(elm, field); \ 665 | } else { \ 666 | if (RB_PARENT(elm, field) && \ 667 | (elm == RB_LEFT(RB_PARENT(elm, field), field))) \ 668 | elm = RB_PARENT(elm, field); \ 669 | else { \ 670 | while (RB_PARENT(elm, field) && \ 671 | (elm == RB_RIGHT(RB_PARENT(elm, field), field)))\ 672 | elm = RB_PARENT(elm, field); \ 673 | elm = RB_PARENT(elm, field); \ 674 | } \ 675 | } \ 676 | return (elm); \ 677 | } \ 678 | \ 679 | /* ARGSUSED */ \ 680 | attr struct type * \ 681 | name##_RB_PREV(struct type *elm) \ 682 | { \ 683 | if (RB_LEFT(elm, field)) { \ 684 | elm = RB_LEFT(elm, field); \ 685 | while 
(RB_RIGHT(elm, field)) \ 686 | elm = RB_RIGHT(elm, field); \ 687 | } else { \ 688 | if (RB_PARENT(elm, field) && \ 689 | (elm == RB_RIGHT(RB_PARENT(elm, field), field))) \ 690 | elm = RB_PARENT(elm, field); \ 691 | else { \ 692 | while (RB_PARENT(elm, field) && \ 693 | (elm == RB_LEFT(RB_PARENT(elm, field), field)))\ 694 | elm = RB_PARENT(elm, field); \ 695 | elm = RB_PARENT(elm, field); \ 696 | } \ 697 | } \ 698 | return (elm); \ 699 | } \ 700 | \ 701 | attr struct type * \ 702 | name##_RB_MINMAX(struct name *head, int val) \ 703 | { \ 704 | struct type *tmp = RB_ROOT(head); \ 705 | struct type *parent = NULL; \ 706 | while (tmp) { \ 707 | parent = tmp; \ 708 | if (val < 0) \ 709 | tmp = RB_LEFT(tmp, field); \ 710 | else \ 711 | tmp = RB_RIGHT(tmp, field); \ 712 | } \ 713 | return (parent); \ 714 | } 715 | 716 | #define RB_NEGINF -1 717 | #define RB_INF 1 718 | 719 | #define RB_INSERT(name, x, y) name##_RB_INSERT(x, y) 720 | #define RB_REMOVE(name, x, y) name##_RB_REMOVE(x, y) 721 | #define RB_FIND(name, x, y) name##_RB_FIND(x, y) 722 | #define RB_NFIND(name, x, y) name##_RB_NFIND(x, y) 723 | #define RB_NEXT(name, x, y) name##_RB_NEXT(y) 724 | #define RB_PREV(name, x, y) name##_RB_PREV(y) 725 | #define RB_MIN(name, x) name##_RB_MINMAX(x, RB_NEGINF) 726 | #define RB_MAX(name, x) name##_RB_MINMAX(x, RB_INF) 727 | 728 | #define RB_FOREACH(x, name, head) \ 729 | for ((x) = RB_MIN(name, head); \ 730 | (x) != NULL; \ 731 | (x) = name##_RB_NEXT(x)) 732 | 733 | #define RB_FOREACH_REVERSE(x, name, head) \ 734 | for ((x) = RB_MAX(name, head); \ 735 | (x) != NULL; \ 736 | (x) = name##_RB_PREV(x)) 737 | 738 | #endif /* _SYS_TREE_H_ */ 739 | -------------------------------------------------------------------------------- /sha1tiny.c: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * SHA1 3 | 
/* SHA-1 hashing routines.
 * see RFC3174 for the SHA-1 algorithm.
 *
 * Jon Mayo - PUBLIC DOMAIN - June 24, 2010
 */
#include <stddef.h>
#include <stdint.h>
/* include the project header when building inside the tree; keep the file
 * self-contained otherwise. */
#if defined(__has_include)
#  if __has_include("sha1tiny.h")
#    include "sha1tiny.h"
#  endif
#endif
#ifndef SHA1_DIGEST_LENGTH
#define SHA1_DIGEST_LENGTH 20	/* must match sha1tiny.h */
#endif

/* rotate v left by b bits, truncate to 32-bits.
 * v is masked before the right shift so that stray bits above bit 31 (legal
 * when uint_least32_t is wider than 32 bits) cannot leak into the result. */
#define ROL32(v, b) \
	(((((v) & 0xfffffffful) << (b)) | \
	  (((v) & 0xfffffffful) >> (32 - (b)))) & 0xfffffffful)

/*
 * Compress one 64-byte block into the running state h[0..4]:
 * expand the block into the 80-word message schedule, run the 80 rounds,
 * then add the working variables back into h (RFC 3174, section 6.1).
 */
static void
sha1_block(uint_least32_t h[5], const unsigned char blk[64])
{
	uint_least32_t w[80];
	uint_least32_t a, b, c, d, e;
	unsigned i;

	/* 16 big-endian input words, then the XOR/rotate recurrence. */
	for (i = 0; i < 16; i++)
		w[i] = ((uint_least32_t)blk[i * 4] << 24)
		    | ((uint_least32_t)blk[i * 4 + 1] << 16)
		    | ((uint_least32_t)blk[i * 4 + 2] << 8)
		    | (uint_least32_t)blk[i * 4 + 3];
	for (i = 16; i < 80; i++)
		w[i] = ROL32(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16], 1);

	a = h[0]; b = h[1]; c = h[2]; d = h[3]; e = h[4];

	for (i = 0; i < 80; i++) {
		uint_least32_t f, k, tmp;

		if (i < 20) {
			f = (b & c) | (~b & d);
			k = 0x5a827999lu;
		} else if (i < 40) {
			f = b ^ c ^ d;
			k = 0x6ed9eba1lu;
		} else if (i < 60) {
			f = (b & c) | (b & d) | (c & d);
			k = 0x8f1bbcdclu;
		} else {
			f = b ^ c ^ d;
			k = 0xca62c1d6lu;
		}
		tmp = (ROL32(a, 5) + f + e + k + w[i]) & 0xfffffffful;
		e = d;
		d = c;
		c = ROL32(b, 30);
		b = a;
		a = tmp;
	}

	h[0] = (h[0] + a) & 0xfffffffful;
	h[1] = (h[1] + b) & 0xfffffffful;
	h[2] = (h[2] + c) & 0xfffffffful;
	h[3] = (h[3] + d) & 0xfffffffful;
	h[4] = (h[4] + e) & 0xfffffffful;
}

/**
 * quick calculation of SHA1 on buffer data.
 * data - pointer (may be NULL only when len is 0).
 * len - length of data in bytes.
 * md - cannot be NULL; receives the 20-byte digest.
 * return md advanced past the digest (md + SHA1_DIGEST_LENGTH), matching
 * the behavior of the original implementation.
 *
 * Fixes over the previous version: a zero-length input and inputs whose
 * length is an exact multiple of 64 now get the mandatory padding block
 * (the old loop exited before appending 0x80 and the length), and the
 * message schedule is built from scratch rather than read-modify-writing
 * an uninitialized buffer.
 */
unsigned char *
sha1(const void *data, size_t len, unsigned char md[SHA1_DIGEST_LENGTH])
{
	uint_least32_t h[5] = {
		0x67452301lu, 0xefcdab89lu, 0x98badcfelu,
		0x10325476lu, 0xc3d2e1f0lu
	};
	const unsigned char *p = data;
	uint_least64_t bits = (uint_least64_t)len * 8; /* total bit count */
	unsigned char tail[128];	/* final padded block(s) */
	size_t n, tlen;
	unsigned i;

	/* hash whole 64-byte blocks straight from the input. */
	while (len >= 64) {
		sha1_block(h, p);
		p += 64;
		len -= 64;
	}

	/* padding: remainder bytes, 0x80, zeros, 64-bit big-endian length. */
	for (n = 0; n < len; n++)
		tail[n] = p[n];
	tail[n++] = 0x80;
	/* one final block if remainder+0x80 fits in 56 bytes, else two. */
	tlen = (n <= 56) ? 64 : 128;
	while (n < tlen - 8)
		tail[n++] = 0;
	for (i = 0; i < 8; i++)
		tail[n++] = (unsigned char)((bits >> (56 - 8 * i)) & 255);
	sha1_block(h, tail);
	if (tlen == 128)
		sha1_block(h, tail + 64);

	/* produce final hash, big-endian. */
	for (i = 0; i < 5; i++) {
		*md++ = (unsigned char)((h[i] >> 24) & 255);
		*md++ = (unsigned char)((h[i] >> 16) & 255);
		*md++ = (unsigned char)((h[i] >> 8) & 255);
		*md++ = (unsigned char)(h[i] & 255);
	}

	return md;
}
#define SHA1TINY_H 3 | /** 4 | * size of a SHA-1 digest in bytes. SHA-1 is 160-bit. 5 | */ 6 | #define SHA1_DIGEST_LENGTH 20 7 | 8 | unsigned char *sha1(const void *data, size_t len, unsigned char md[SHA1_DIGEST_LENGTH]); 9 | #endif 10 | -------------------------------------------------------------------------------- /test_btree.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 Jon Mayo 3 | * 4 | * Permission to use, copy, modify, and distribute this software for any 5 | * purpose with or without fee is hereby granted, provided that the above 6 | * copyright notice and this permission notice appear in all copies. 7 | * 8 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 9 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 10 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 11 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 12 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 13 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 14 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
15 | */ 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include "btree.h" 21 | 22 | static int passes, failures; 23 | 24 | /* a copy of a very old rand() */ 25 | #define ANS_RAND_MAX 0x7fffffff 26 | 27 | static unsigned long ans_seed; 28 | 29 | static void 30 | ans_srand(unsigned long seed) 31 | { 32 | ans_seed = seed; 33 | } 34 | 35 | static unsigned 36 | ans_rand(void) 37 | { 38 | ans_seed = (1103515245 * ans_seed + 12345) & ANS_RAND_MAX; 39 | return ans_seed; 40 | } 41 | 42 | static const char * 43 | randname(unsigned len) 44 | { 45 | static char buf[64]; 46 | const char set[] = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLM" 47 | "NOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"; 48 | unsigned i; 49 | 50 | assert(len > 0); 51 | if (len >= sizeof(buf) ) 52 | len = sizeof(buf) - 1; 53 | 54 | if (len <= 0) { 55 | fprintf(stderr, "ERROR:requested length is zero\n"); 56 | return NULL; 57 | } 58 | fprintf(stderr, "INFO:len=%d\n", len); 59 | 60 | for (i = 0; i < len; i++) { 61 | buf[i] = set[ans_rand() % (sizeof(set) - 1)]; 62 | } 63 | buf[i] = 0; 64 | return buf; 65 | } 66 | 67 | static int 68 | do_writes(struct btree *bt, int count, int min, int max) 69 | { 70 | char value[64]; 71 | int i; 72 | 73 | ans_srand(0); 74 | for (i = 0; i < count; i++) { 75 | struct btval key, data; 76 | int rc; 77 | 78 | key.data = (void*)randname(ans_rand() % (max - min + 1) + min); 79 | key.size = strlen(key.data); 80 | if (key.size <= 0) { 81 | fprintf(stderr, "ERROR:key is zero length\n"); 82 | return 0; 83 | } 84 | data.data = value; 85 | data.size = snprintf(value, sizeof(value), "%X", i); 86 | fprintf(stderr, "INFO:put key '%s'\n", (char*)key.data); 87 | rc = btree_put(bt, &key, &data, 0); 88 | if (rc != BT_SUCCESS) { 89 | fprintf(stderr, "ERROR:failed to write key '%s'\n", 90 | (char*)key.data); 91 | return 0; 92 | } 93 | } 94 | 95 | return 1; 96 | } 97 | 98 | static int 99 | do_reads(struct btree *bt, int count, int min, int max) 100 | { 101 | char value[64]; 
	size_t value_len;
	int i;

	/* same seed as do_writes(): regenerates the identical key order */
	ans_srand(0);
	for (i = 0; i < count; i++) {
		struct btval key, data;
		int rc;

		key.data = (void*)randname(ans_rand() % (max - min + 1) + min);
		key.size = strlen(key.data);
		if (key.size <= 0) {
			fprintf(stderr, "ERROR:key is zero length\n");
			return 0;
		}
		data.data = NULL;
		data.size = 0;
		/* expected value: "%X" of the pass index, as stored by do_writes() */
		value_len = snprintf(value, sizeof(value), "%X", i);
		fprintf(stderr, "INFO:get key '%s'\n", (char*)key.data);
		rc = btree_get(bt, &key, &data);
		if (rc != BT_SUCCESS) {
			fprintf(stderr, "ERROR:failed to read key '%s'\n",
			    (char*)key.data);
			return 0;
		}
		/* both length and bytes must match what was written */
		if (value_len != data.size || memcmp(value, data.data, value_len)) {
			fprintf(stderr, "ERROR:unexpected value '%.*s' from key '%s'\n",
			    (int)data.size, (char*)data.data,
			    (char*)key.data);
			return 0;
		}
	}

	return 1;
}

/*
 * Regenerate the same key sequence once more and delete every key.
 * Returns 1 on success, 0 if any delete fails.
 */
static int
do_deletes(struct btree *bt, int count, int min, int max)
{
	int i;

	/* same seed as do_writes(): regenerates the identical key order */
	ans_srand(0);
	for (i = 0; i < count; i++) {
		struct btval key;
		int rc;

		key.data = (void*)randname(ans_rand() % (max - min + 1) + min);
		key.size = strlen(key.data);
		if (key.size <= 0) {
			fprintf(stderr, "ERROR:key is zero length\n");
			return 0;
		}
		fprintf(stderr, "INFO:delete key '%s'\n", (char*)key.data);
		rc = btree_del(bt, &key, NULL);
		if (rc != BT_SUCCESS) {
			fprintf(stderr, "ERROR:failed to delete key '%s'\n",
			    (char*)key.data);
			return 0;
		}
	}

	return 1;
}

/*
 * Record one test outcome (status non-zero = pass) under the given
 * name and log a PASS/FAILURE line for it.
 */
static void
test(const char *name, int status)
{
	if (status)
		passes++;
	else
		failures++;
	fprintf(stderr, "%s:%s\n", status ?
"PASS" : "FAILURE", name); 173 | } 174 | 175 | static void 176 | report(void) 177 | { 178 | fprintf(stderr, "RESULTS:total=%d failures=%d\n", 179 | passes + failures, failures); 180 | } 181 | 182 | int 183 | main() 184 | { 185 | struct btree *bt; 186 | const char filename[] = "test.db"; 187 | 188 | test("btree_open(BT_NOSYNC)", 189 | (bt = btree_open(filename, BT_NOSYNC, 0644)) != NULL); 190 | test("do_writes(50, 1, 33)", do_writes(bt, 50, 1, 33)); 191 | test("do_reads(50, 1, 33)", do_reads(bt, 50, 1, 33)); 192 | test("do_deletes(50, 1, 33)", do_deletes(bt, 50, 1, 33)); 193 | test("btree_close", (btree_close(bt),1)); 194 | 195 | 196 | test("btree_open(!BT_NOSYNC)", 197 | (bt = btree_open(filename, 0, 0644)) != NULL); 198 | test("do_writes(50, 1, 33)", do_writes(bt, 50, 1, 33)); 199 | test("do_reads(50, 1, 33)", do_reads(bt, 50, 1, 33)); 200 | test("do_deletes(50, 1, 33)", do_deletes(bt, 50, 1, 33)); 201 | test("btree_close", (btree_close(bt),1)); 202 | 203 | 204 | test("btree_open(BT_RDONLY)", 205 | (bt = btree_open(filename, BT_RDONLY, 0644)) != NULL); 206 | test("do_reads(50, 1, 33)", do_reads(bt, 50, 1, 33)); 207 | test("btree_close", (btree_close(bt),1)); 208 | 209 | 210 | test("btree_open(BT_NOSYNC | BT_REVERSEKEY)", 211 | (bt = btree_open(filename, BT_NOSYNC | BT_REVERSEKEY, 0644)) != NULL); 212 | test("do_writes(50, 1, 33)", do_writes(bt, 50, 1, 33)); 213 | test("do_reads(50, 1, 33)", do_reads(bt, 50, 1, 33)); 214 | test("do_deletes(50, 1, 33)", do_deletes(bt, 50, 1, 33)); 215 | test("btree_close", (btree_close(bt),1)); 216 | 217 | report(); 218 | return failures ? 
EXIT_FAILURE : EXIT_SUCCESS; 219 | } 220 | -------------------------------------------------------------------------------- /test_sha1tiny.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "sha1tiny.h" 4 | 5 | static const char test0[] = "The quick brown fox jumps over the lazy dog"; 6 | static const unsigned char test0_digest[SHA1_DIGEST_LENGTH] = { 7 | 0x2f, 0xd4, 0xe1, 0xc6, 0x7a, 0x2d, 0x28, 0xfc, 0xed, 0x84, 0x9e, 0xe1, 0xbb, 0x76, 0xe7, 0x39, 0x1b, 0x93, 0xeb, 0x12, 8 | }; 9 | static unsigned char test1[] = { 10 | 0x83, 0xb8, 0x08, 0xe5, 0xcd, 0x85, 0xbf, 0xe1, 0x5c, 0x27, 0x7c, 0x6a, 11 | 0x98, 0x79, 0xa0, 0x22, 0xf5, 0xaf, 0xab, 0xff, 0x86, 0x05, 0xf4, 0x8e, 12 | 0xc0, 0x32, 0x80, 0x37, 0xb2, 0xd6, 0xe9, 0x11, 0xb8, 0x68, 0xd0, 0x7d, 13 | 0xe6, 0x50, 0x28, 0x0b, 0xcb, 0x1e, 0x69, 0x53, 0xe9, 0x10, 0x83, 0xdf, 14 | 0x59, 0x54, 0xda, 0x73, 0x22, 0xa6, 0xd0, 0x82, 0x38, 0x80, 0x02, 0xb4, 15 | 0x27, 0x63, 0xd2, 0xf6, 0x01 16 | }; 17 | static const size_t test1_len = 65; 18 | static const unsigned char test1_digest[] = { 19 | 0x68, 0xb2, 0x1d, 0x8f, 0x56, 0x1a, 0xd0, 0x8e, 0x68, 0x3e, 0x61, 0x95, 0x4c, 0x66, 0xac, 0x53, 0xf1, 0x24, 0x56, 0xa7 20 | }; 21 | static unsigned char test2[] = { 22 | 0x54, 0x91, 0x06, 0x9b, 0x24, 0x00, 0x8b, 0xfd, 0x9e, 0x4a, 0x52, 0xb5 23 | }; 24 | static const size_t test2_len = 12; 25 | static const unsigned char test2_digest[] = { 26 | 0xc8, 0xd9, 0x2c, 0xb3, 0x81, 0x97, 0x1d, 0xc8, 0xe7, 0xa4, 0x1a, 0xfd, 0xb7, 0x9e, 0x0f, 0x4f, 0x86, 0x76, 0xa0, 0xef 27 | }; 28 | static unsigned char test3[] = { 29 | 0xc2, 0xba, 0xb3, 0x02, 0xfb, 0x08, 0x58, 0x87, 0xad, 0x23, 0xc7, 0x4a, 30 | 0xca, 0x45, 0x40, 0xc2, 0xa4, 0x64, 0x96, 0xae, 0x40, 0x41, 0x16, 0x0e, 31 | 0x49, 0xd7, 0xdc, 0x53, 0x7e, 0xc6, 0x5e, 0xa2, 0x1a, 0x67, 0x35, 0x0a, 32 | 0x4d, 0x4a, 0xdc, 0x9a, 0x78, 0xf4, 0xd6, 0x69, 0xb6, 0x4e, 0xb1, 0x31, 33 | 0x92, 0x8b, 0xbe, 0xcd, 0x0b, 0x4f, 0x83, 0x0f, 0xe1, 
0x2e, 0x0e, 0x8c, 34 | 0xf7, 0x9e, 0xf1 35 | }; 36 | static const size_t test3_len = 63; 37 | static const unsigned char test3_digest[] = { 38 | 0xe6, 0x11, 0x4e, 0xea, 0x22, 0x13, 0x57, 0xe3, 0x48, 0x42, 0x7f, 0x5e, 0x5d, 0x6e, 0xc3, 0x59, 0x4f, 0x3d, 0x50, 0x7b 39 | }; 40 | static unsigned char test4[] = { 41 | 0x99, 0xd3, 0xf5, 0x77, 0x0f, 0x22, 0xc3, 0x34, 0x46, 0xb1, 0x4c, 0x69, 42 | 0x08, 0x9b, 0x22, 0x5b, 0x88, 0xbf, 0x10, 0x9a, 0xe4, 0x86, 0x8c, 0xa4, 43 | 0xb3, 0x5e, 0xa7, 0x1a, 0x36, 0xc8, 0x92, 0xd5, 0x06, 0x62, 0xd1, 0x65, 44 | 0x22, 0xa9, 0x33, 0xe1, 0x5a, 0x29 45 | }; 46 | static const size_t test4_len = 42; 47 | static const unsigned char test4_digest[] = { 48 | 0x0b, 0x0f, 0x2b, 0xce, 0xf8, 0x44, 0x8f, 0xfe, 0x53, 0xd2, 0x25, 0x40, 0x0d, 0xa5, 0x07, 0x17, 0x84, 0x3a, 0xab, 0x33 49 | }; 50 | 51 | static void sha1_print_digest(const unsigned char *md) 52 | { 53 | unsigned i; 54 | 55 | for(i = 0; i < SHA1_DIGEST_LENGTH; i++) { 56 | printf("%02X", md[i]); 57 | 58 | if(i < SHA1_DIGEST_LENGTH - 1) printf(":"); 59 | } 60 | 61 | printf("\n"); 62 | } 63 | 64 | static int sha1_test(void) 65 | { 66 | struct { 67 | const void *data; 68 | size_t len; 69 | const unsigned char *digest; 70 | } test[] = { 71 | { test0, strlen(test0), test0_digest }, 72 | { test1, test1_len, test1_digest }, 73 | { test2, test2_len, test2_digest }, 74 | { test3, test3_len, test3_digest }, 75 | { test4, test4_len, test4_digest }, 76 | }; 77 | unsigned char digest[SHA1_DIGEST_LENGTH]; 78 | unsigned i; 79 | 80 | printf("Running %zd tests.\n", sizeof test / sizeof * test); 81 | 82 | for(i = 0; i < sizeof test / sizeof * test; i++) { 83 | memset(digest, 0, sizeof digest); 84 | 85 | if(!sha1(test[i].data, test[i].len, digest)) { 86 | printf("failed.\n"); 87 | return 0; 88 | } 89 | 90 | /* TODO: dump in a clean way 91 | printf("test%d[%d] = \"%s\"\n", i, test[i].len, test[i].data); 92 | */ 93 | 94 | printf("test%d length = %zd\n", i, test[i].len); 95 | 96 | printf("calculated : "); 97 | 
sha1_print_digest(digest); 98 | 99 | printf("known : "); 100 | sha1_print_digest(test[i].digest); 101 | 102 | printf("test%d: %s\n", i, memcmp(digest, test[i].digest, SHA1_DIGEST_LENGTH) ? "FAILED" : "PASSED"); 103 | } 104 | 105 | return 0; 106 | } 107 | 108 | int main() 109 | { 110 | return sha1_test() ? 0 : 1; 111 | } 112 | --------------------------------------------------------------------------------