├── .gitignore ├── all.do ├── LICENSE ├── rolla_test.c ├── rolla.h ├── README.md └── rolla.c /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | -------------------------------------------------------------------------------- /all.do: -------------------------------------------------------------------------------- 1 | export CFLAGS="-g -Wall -Werror -pedantic -std=gnu99 -O2 -fno-strict-aliasing" 2 | gcc $CFLAGS -o rolla_test rolla.c rolla_test.c 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | rolla is copyright (c) 2013 Jamie Turner and Bump Technologies, Inc, 3 | and authors and contributors 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions 7 | are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 3. The names of the authors and copyright holders may not be used to 15 | endorse or promote products derived from this software without 16 | specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, 19 | INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 20 | AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL 21 | THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 24 | OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 25 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 26 | OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 27 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /rolla_test.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "rolla.h" 9 | 10 | #define COUNT 1000000 11 | #define DCOUNT ((double)COUNT) 12 | 13 | double doublenow() { 14 | struct timeval tv; 15 | gettimeofday(&tv, NULL); 16 | 17 | return (double)tv.tv_sec 18 | + (((double)tv.tv_usec) / 1000000.0); 19 | } 20 | 21 | int main() { 22 | double start; 23 | printf("-- load --\n"); 24 | start = doublenow(); 25 | rolla *db = rolla_create("db"); 26 | printf("load took %.3f\n", 27 | doublenow() - start); 28 | 29 | int i; 30 | char buf2[8] = {0}; 31 | printf("-- write --\n"); 32 | 33 | start = doublenow(); 34 | for (i=0; i < COUNT; i++) { 35 | snprintf(buf2, 8, "%d", i % 2 ? i : 4); 36 | rolla_set(db, buf2, (uint8_t *)buf2, 8); 37 | } 38 | double final; 39 | final = doublenow(); 40 | printf("write took %.3f (%.3f/s)\n", 41 | final - start, DCOUNT / (final - start)); 42 | sleep(3); 43 | 44 | printf("-- read --\n"); 45 | uint32_t sz; 46 | start = doublenow(); 47 | for (i=0; i < COUNT; i++) { 48 | snprintf(buf2, 8, "%d", i % 2 ? 
i : 4); 49 | char *p = (char *)rolla_get(db, buf2, &sz); 50 | assert(p && !strcmp(buf2, p)); 51 | free(p); 52 | if (i % 100000 == 0) 53 | printf("%d\n", i); 54 | } 55 | final = doublenow(); 56 | printf("read took %.3f (%.3f/s)\n", 57 | final - start, DCOUNT / (final - start)); 58 | 59 | snprintf(buf2, 8, "%d", 4); 60 | char *p = (char *)rolla_get(db, buf2, &sz); 61 | assert(p); 62 | free(p); 63 | rolla_del(db, buf2); 64 | p = (char *)rolla_get(db, buf2, &sz); 65 | assert(!p); 66 | 67 | rolla_close(db, 1); 68 | 69 | return 0; 70 | } 71 | -------------------------------------------------------------------------------- /rolla.h: -------------------------------------------------------------------------------- 1 | #ifndef ROLLA_H 2 | #define ROLLA_H 3 | 4 | #include 5 | 6 | /* The whole system swings on this parameter... 7 | 8 | Set it to something large (128k to 256k buckets) 9 | for something with tens of millions of keys. 10 | 256k will need at least ~1MB per database. 11 | 12 | Keep it small (8k) for very good performance with 13 | low memory overhead (<40kB) on embedded systems 14 | with less than 1M keys. 15 | */ 16 | #define NUMBUCKETS (1024 * 8) 17 | 18 | /* This is your database, friend. */ 19 | typedef struct rolla rolla; 20 | 21 | /* Create a database (in a single file) at `path`. 22 | If `path` does not exist, it will be created; 23 | otherwise, it will be loaded. */ 24 | rolla * rolla_create(char *path); 25 | 26 | /* Force the database to be sync'd to disk (msync) */ 27 | void rolla_sync(rolla *r); 28 | 29 | /* Close the database; compress=1 means rewrite the database 30 | to eliminate redundant values for each single key */ 31 | void rolla_close(rolla *r, int compress); 32 | 33 | /* Set `key` to byte array `val` of `vlen` bytes; you still 34 | own key and val, they are not retained */ 35 | void rolla_set(rolla *r, char *key, uint8_t *val, uint32_t vlen); 36 | 37 | /* Get the value for `key`, which will be `*len` bytes long. 
38 | NULL will be returned if the key is not found. Otherwise, 39 | a value will be returned to you that is allocated on the heap. 40 | You own it, you must free() it eventually. */ 41 | uint8_t * rolla_get(rolla *r, char *key, uint32_t *len); 42 | 43 | /* Remove the value `key` from the database. Harmless NOOP 44 | if `key` does not exist */ 45 | void rolla_del(rolla *r, char *key); 46 | 47 | /* Iterate over all keys in the database. See the note in the 48 | README.md about caveats associated with iteration and mutation */ 49 | typedef void(*rolla_iter_cb) (rolla *r, char *key, uint8_t *val, uint32_t length, void *passthrough); 50 | void rolla_iter(rolla *r, rolla_iter_cb cb, void *passthrough); 51 | 52 | #endif /* ROLLA_H */ 53 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | rolla 2 | === 3 | 4 | `rolla` is a simple key/value storage engine. It aims to achieve a good balance 5 | of fast query performance, low disk space, and data safety while being stingy 6 | with memory. It was designed primarily for embedding in mobile applications. 7 | 8 | Features 9 | -------- 10 | 11 | * Database is pure C and relies only on the C standard library 12 | * Database is crash-only design; database files cannot go corrupt 13 | * Inserts are O(1) 14 | * Queries are O(1) .. 
O(n), though on typical workloads with hot 15 | spots the performance is closer to O(1) due to scanning hash buckets 16 | in write-recency order 17 | * An idle (but fully loaded) database uses < 40kB of memory with 18 | default tuning parameters 19 | * Database files can compact on close, so amortized disk use is O(keys) 20 | * A key enumeration interface is supported 21 | * On-disk representation has only 10 bytes of overhead for each 22 | key/value pair 23 | * Already in use on millions of iOS and Android devices (armv7) 24 | 25 | Caveats 26 | ------- 27 | 28 | * `rolla` databases are **NOT THREAD SAFE**. If you want to access a 29 | `rolla` database from multiple threads concurrently, you'll have to 30 | serialize access in a higher layer. The library is fully 31 | re-entrant, however--so you can use *different* databases 32 | in multiple threads concurrently. 33 | * Keys are null-terminated C-Strings, not bytestrings 34 | * Keys must be <=255 bytes including the terminating NUL (at most 254 characters) 35 | * An entire database cannot exceed 4GB 36 | * Misses are expensive (they must read approx 1/8192 of the database 37 | off disk to invalidate the key) 38 | * Compaction only happens when the database is closed and reloaded, 39 | it cannot be done while online 40 | * Files are assumed little endian and are not portable across 41 | architectures of different byte orderings 42 | * `rolla` does not call `fsync()` or `msync()`, but instead relies on 43 | the operating system's schedule. Some data that was not totally 44 | committed to the backing hardware could be lost if system failure 45 | happened (though the database will come up cleanly with a subset of 46 | records, "repair" is never necessary). Applications can 47 | call rolla_sync() if they want to force an fsync() more often. 48 | * A database created with a certain bucket count must always be 49 | used with that bucket count. The bucket count cannot change. 50 | 51 | Design 52 | ------ 53 | 54 | Rolla has a very simple design. 
55 | 56 | When a rolla database is created, an array of buckets is allocated 57 | (by default 8192). Database keys are hashed onto these buckets. 58 | The bucket values contain the offset to the _last_ entry in the 59 | file on disk that contains a key that was hashed into that bucket. 60 | 61 | On a read, this last value is checked. If the key matches, the 62 | value is returned; otherwise, the record at this offset in the 63 | file contains a link back to the *previous* offset in this bucket, 64 | and so on. Traversal continues until a record is encountered 65 | with a sentinel value that indicates there are no more records 66 | in this bucket. This is why misses (key not found) are 67 | expensive in rolla. 68 | 69 | A write is simply a matter of overwriting the in-memory bucket 70 | value to point to the current EOF marker in the on-disk file, 71 | then taking the old value from this bucket and writing it out 72 | to the disk as the previous link in the chain, along with the 73 | new key and value. In this way, writes are always appends. 74 | 75 | Mutating the same key over and over again will grow the 76 | database without bound. `rolla_close()` takes a `compress` 77 | flag that will walk the database and write a new version 78 | with only the current values for each key. It is important 79 | to close and compress active rolla databases regularly to 80 | both preserve disk space and prevent them from exceeding 81 | their 4GB limit. Compaction is done in a crash-safe manner 82 | using a tempfile and `rename()`. 83 | 84 | When a database is reloaded, the file is walked and the 85 | bucket array is re-populated with the final offsets. Any 86 | partial records are truncated in case of a crash. 87 | 88 | Rolla uses the `mmap()` syscall to memory-map this file as 89 | an array of records (C structs). 90 | 91 | Moving the NUMBUCKETS constant up and down can tune rolla 92 | for different expected key counts. 
Clearly, larger bucket 93 | counts result in fewer seeks on disk because only a 94 | very few keys will map to that bucket, and link walking is 95 | minimal or absent. 96 | 97 | Rolla was designed to run on 32-bit systems, so offset 98 | values are typically unsigned 32-bit integers. This limits 99 | total database size to ~4GB. 100 | 101 | Usage 102 | ----- 103 | 104 | You can build a quick rolla_test program using redo. Install this: 105 | 106 | https://github.com/apenwarr/redo 107 | 108 | ... then type `redo` in this repository root. 109 | 110 | But generally, just build rolla.c into your project. Recommended 111 | compiler flags are: 112 | 113 | CFLAGS="-g -Wall -Werror -pedantic -std=gnu99 -O2 -fno-strict-aliasing" 114 | 115 | See `rolla.h` for API. set, get, del, sync, create, close, iter. It's 116 | a key/value storage engine! 117 | 118 | Iteration Note 119 | -------------- 120 | 121 | If you change values (set or del) in `rolla_iter()`, make 122 | sure to pass *copies* of keys to set and del. The 123 | keys given to you by rolla are being borrowed from the 124 | mmaped file, and if the memory mapped region should move due to 125 | database growth caused by your mutation, those pointers will 126 | become invalid. 127 | 128 | Benchmarks 129 | ---------- 130 | 131 | **Hardware** 132 | 133 | VMWare Fusion 134 | Arch Linux 3.3.7-1-ARCH #1 SMP PREEMPT x64 135 | Intel(R) Core(TM) i7-2677M CPU @ 1.80GHz 136 | Crucial 256GB SSD 137 | 1.4GB RAM (likely on-disk file fully in page cache) 138 | 139 | **Data** 140 | 141 | Keys and values are 1-6 bytes [1..1000000]. 
142 | 143 | **Test 1** 144 | 145 | NUMBUCKETS=8192 146 | 100,000 writes, and 100,000 reads 147 | writes 3609742.327/s 148 | reads 4034846.852/s 149 | (note: this is the sweet spot for this library) 150 | 151 | **Test 2** 152 | 153 | NUMBUCKETS=8192 154 | 1,000,000 writes, and 1,000,000 reads 155 | writes 5558771.063/s 156 | reads 621547.615/s 157 | (note: too few buckets, too much chaining) 158 | 159 | **Test 3** 160 | 161 | NUMBUCKETS=262144 162 | 1,000,000 writes, and 1,000,000 reads 163 | writes 4818579.943/s 164 | reads 3342098.868/s 165 | (note: less chaining and more req'd ram.. ~1MB) 166 | 167 | Author 168 | ------ 169 | 170 | Jamie Turner and others at Bump Technologies, Inc. 171 | -------------------------------------------------------------------------------- /rolla.c: -------------------------------------------------------------------------------- 1 | #include "rolla.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #define NO_BACKTRACE ((uint32_t)0xffffffff) 16 | #define CACHE_SIZE (32 * 1024) 17 | #define DEBUG_PRINT_TRUNCATE 1 18 | 19 | struct rolla { 20 | uint32_t offsets[NUMBUCKETS]; 21 | int mapfd; 22 | char *path; 23 | uint8_t *map; 24 | uint32_t mmap_alloc; 25 | uint32_t eof; 26 | uint8_t cache[CACHE_SIZE]; 27 | int cache_used; 28 | }; 29 | 30 | static void rolla_remap(rolla *r) { 31 | if (r->map) { 32 | msync(r->map, r->mmap_alloc, MS_SYNC); 33 | int s = munmap(r->map, r->mmap_alloc); 34 | assert(!s); 35 | } 36 | r->mmap_alloc = r->eof; 37 | if (r->mmap_alloc) { 38 | r->map = (uint8_t *)mmap( 39 | NULL, r->mmap_alloc, PROT_READ, MAP_SHARED, r->mapfd, 0); 40 | } else { 41 | r->map = NULL; 42 | } 43 | r->cache_used = 0; 44 | } 45 | 46 | static uint32_t jenkins_one_at_a_time_hash(char *key, size_t len); 47 | 48 | #define hash jenkins_one_at_a_time_hash 49 | 50 | static uint32_t rolla_index_lookup(rolla *r, char *key) { 51 | uint32_t fh 
= hash(key, strlen(key)) % NUMBUCKETS; 52 | return r->offsets[fh]; 53 | } 54 | 55 | static uint32_t rolla_index_keyval(rolla *r, char *key, uint32_t off) { 56 | uint32_t fh = hash(key, strlen(key)) % NUMBUCKETS; 57 | 58 | uint32_t last = r->offsets[fh]; 59 | r->offsets[fh] = off; 60 | 61 | return last; 62 | } 63 | 64 | static void rolla_load(rolla *r) { 65 | 66 | struct stat st; 67 | 68 | r->mapfd = open(r->path, 69 | O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); 70 | assert(r->mapfd > -1); 71 | 72 | int s = fstat(r->mapfd, &st); 73 | if (!s) { 74 | r->eof = st.st_size; 75 | } 76 | else { 77 | r->eof = 0; 78 | } 79 | 80 | rolla_remap(r); 81 | 82 | if (r->map) { 83 | int save_eof = r->eof; 84 | 85 | uint8_t *p = r->map; 86 | uint32_t off = 0; 87 | while (1) { 88 | if (off == r->eof) { 89 | break; /* no recovery, clean shutdown */ 90 | } 91 | if (off + 9 > r->eof) { 92 | #if(DEBUG_PRINT_TRUNCATE) 93 | fprintf(stderr, "rolla: recovering truncated db!\n"); 94 | #endif 95 | r->eof = off; 96 | break; 97 | } 98 | unsigned char klen = *(uint8_t *)p; 99 | uint32_t vlen = *(uint32_t *)(p + 1); 100 | uint32_t last = *(uint32_t *)(p + 5); 101 | uint32_t jump = 9 + klen + vlen; 102 | 103 | if (klen == 0 || (off + jump > r->eof)) { 104 | #if(DEBUG_PRINT_TRUNCATE) 105 | fprintf(stderr, "rolla: recovering truncated db!\n"); 106 | #endif 107 | r->eof = off; 108 | break; 109 | } 110 | 111 | char *key = (char *)(p + 9); 112 | 113 | uint32_t prev = rolla_index_keyval(r, key, off); 114 | assert(prev == last); 115 | 116 | off += jump; 117 | p += jump; 118 | } 119 | 120 | if (save_eof != r->eof) { 121 | s = ftruncate(r->mapfd, (off_t)r->eof); 122 | assert(!s); 123 | rolla_remap(r); 124 | } 125 | 126 | lseek(r->mapfd, r->eof, SEEK_SET); 127 | } 128 | } 129 | 130 | rolla * rolla_create(char *path) { 131 | rolla *r = calloc(1, sizeof(rolla)); 132 | r->path = malloc(strlen(path) + 1); 133 | strcpy(r->path, path); 134 | r->mmap_alloc = 0; 135 | r->cache_used = 0; 136 | memset(r->offsets, 0xff, 
NUMBUCKETS * sizeof(uint32_t)); 137 | 138 | rolla_load(r); 139 | 140 | return r; 141 | } 142 | 143 | uint8_t * rolla_get(rolla *r, char *key, uint32_t *len) { 144 | uint32_t off = rolla_index_lookup(r, key); 145 | 146 | while (off != NO_BACKTRACE) { 147 | uint8_t *p; 148 | if (off >= r->mmap_alloc) { 149 | p = &r->cache[off - r->mmap_alloc]; 150 | } else { 151 | p = &r->map[off]; 152 | } 153 | uint8_t klen = *(uint8_t *)p; 154 | if (!strncmp((char *)(p + 9), key, klen)) { 155 | uint32_t vlen = *(uint32_t *)(p + 1); 156 | if (vlen == 0) { 157 | return NULL; 158 | } 159 | uint8_t *res = malloc(vlen); 160 | *len = vlen; 161 | memmove(res, p + 9 + klen, vlen); 162 | return res; 163 | } 164 | off = *(uint32_t *)(p + 5); 165 | } 166 | 167 | return NULL; 168 | } 169 | 170 | void rolla_sync(rolla *r) { 171 | int s = fsync(r->mapfd); 172 | assert(!s); 173 | } 174 | 175 | void rolla_set(rolla *r, char *key, uint8_t *val, uint32_t vlen) { 176 | uint8_t klen = strlen(key) + 1; 177 | uint32_t step = 9 + klen + vlen; 178 | /* write to the mapfd */ 179 | uint32_t last = rolla_index_keyval(r, key, r->eof); 180 | struct iovec iov[] = { 181 | {&klen, sizeof(uint8_t)}, 182 | {&vlen, sizeof(uint32_t)}, 183 | {&last, sizeof(uint32_t)}, 184 | {key, klen}, 185 | {val, vlen}}; 186 | 187 | int bwrite = writev(r->mapfd, iov, 5); 188 | assert(bwrite == step); 189 | r->eof += step; 190 | 191 | if (r->cache_used + step > CACHE_SIZE) { 192 | rolla_remap(r); 193 | } 194 | else { 195 | /* write to trailing cache */ 196 | uint8_t *p = &r->cache[r->cache_used]; 197 | 198 | *((uint8_t *)p) = klen; 199 | *((uint32_t *)(p + 1)) = vlen; 200 | *((uint32_t *)(p + 5)) = last; 201 | memmove(p + 9, key, klen); 202 | if (vlen) 203 | memmove(p + 9 + klen, val, vlen); 204 | r->cache_used += step; 205 | } 206 | } 207 | 208 | void rolla_del(rolla *r, char *key) { 209 | rolla_set(r, key, NULL, 0); 210 | } 211 | 212 | void rolla_iter(rolla *r, rolla_iter_cb cb, void *passthrough) { 213 | int i; 214 | 215 | int sl 
= 10 * 1024; 216 | char *s = realloc(NULL, sl); 217 | 218 | rolla_remap(r); /* since we do not use the write-ahead cache */ 219 | 220 | for (i = 0; i < NUMBUCKETS; i++) { 221 | uint32_t search_off = 0; 222 | s[0] = 0; 223 | uint32_t off = r->offsets[i]; 224 | char buf[258]; 225 | buf[0] = 1; 226 | while (off != NO_BACKTRACE) { 227 | uint8_t klen = *(uint8_t *)(r->map + off); 228 | char *key = (char *)(r->map + off + 9); 229 | strcpy(buf + 1, key); 230 | buf[klen] = 1; 231 | buf[klen + 1] = 0; 232 | uint32_t skey_len = klen + 2; 233 | if (!strstr(s, buf)) { 234 | /* not found */ 235 | uint32_t vlen = *(uint32_t *)(r->map + off + 1); 236 | if (vlen) { 237 | uint8_t *val = (uint8_t *)(r->map + off + klen + 9); 238 | cb(r, key, val, vlen, passthrough); 239 | } 240 | if (search_off + skey_len >= sl) { 241 | sl *= 2; 242 | s = realloc(s, sl); 243 | } 244 | strcpy(s + search_off, buf); 245 | search_off += skey_len - 1; /* trailing \0 */ 246 | } 247 | 248 | off = *(uint32_t *)(r->map + off + 5); 249 | } 250 | } 251 | 252 | free(s); 253 | } 254 | 255 | static void rolla_rewrite_cb(rolla *r, char *key, uint8_t *data, uint32_t length, void *pass) { 256 | rolla *new = (rolla *)pass; 257 | rolla_set(new, key, data, length); 258 | } 259 | 260 | void rolla_close(rolla *r, int compress) { 261 | char path[1200] = {0}; 262 | if (compress) { 263 | assert(strlen(r->path) < 1100); 264 | strcat(path, r->path); 265 | strcat(path, ".rolla_rewrite"); 266 | 267 | rolla *tmp = rolla_create(path); 268 | rolla_iter(r, rolla_rewrite_cb, (void *)tmp); 269 | rolla_close(tmp, 0); 270 | } 271 | 272 | munmap(r->map, r->mmap_alloc); 273 | close(r->mapfd); 274 | 275 | if (compress) { 276 | rename(path, r->path); 277 | } 278 | 279 | free(r->path); 280 | free(r); 281 | } 282 | 283 | /* From Bob Jenkins/Dr. 
// Dobbs */
/*
 * Bob Jenkins' one-at-a-time hash of the first `len` bytes of `key`.
 *
 * Every byte is mixed into the running value with an add, a shift-add and
 * a shift-xor; three final shift steps spread the last bytes over the whole
 * word. An empty input hashes to 0.
 *
 * Bytes are added as plain `char`, so a byte >= 0x80 contributes a
 * different amount on targets where `char` is signed than where it is
 * unsigned. That is left untouched on purpose: rolla_load() re-derives
 * bucket chains with this function and checks them against the links
 * stored in the file.
 */
static uint32_t jenkins_one_at_a_time_hash(char *key, size_t len)
{
    uint32_t h = 0;
    size_t i; /* same width as len, so the loop ends for any length */

    for (i = 0; i < len; ++i) {
        h += key[i];
        h += (h << 10);
        h ^= (h >> 6);
    }
    h += (h << 3);
    h ^= (h >> 11);
    h += (h << 15);
    return h;
}