├── .gitignore ├── LICENSE ├── cdb.c ├── cdb.h ├── host.c ├── host.h ├── main.c ├── makefile ├── readme.md └── t /.gitignore: -------------------------------------------------------------------------------- 1 | *.1 2 | *.a 3 | *.bin 4 | *.cdb 5 | *.db 6 | *.dll 7 | *.exe 8 | *.o 9 | *.so 10 | *.tgz 11 | *.tmp 12 | *.txt 13 | cdb 14 | install/* 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 
23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /cdb.c: -------------------------------------------------------------------------------- 1 | /* Program: Constant Database Library 2 | * Author: Richard James Howe 3 | * Email: howe.r.j.89@gmail.com 4 | * License: Unlicense 5 | * Repo: 6 | * 7 | * Consult the "readme.md" file for a detailed description 8 | * of the file format and internals. */ 9 | 10 | #include "cdb.h" 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #ifndef CDB_VERSION 17 | #define CDB_VERSION (0x000000ul) /* all zeros = built incorrectly (set in makefile) */ 18 | #endif 19 | 20 | #ifndef CDB_TESTS_ON 21 | #define CDB_TESTS_ON (1) 22 | #endif 23 | 24 | #ifndef CDB_WRITE_ON 25 | #define CDB_WRITE_ON (1) 26 | #endif 27 | 28 | #ifndef CDB_MEMORY_INDEX_ON /* use in memory hash table if '1' for first table */ 29 | #define CDB_MEMORY_INDEX_ON (0) 30 | #endif 31 | 32 | #ifndef CDB_READ_BUFFER_LENGTH 33 | #define CDB_READ_BUFFER_LENGTH (256ul) 34 | #endif 35 | 36 | #ifndef cdb_assert 37 | #define cdb_assert(X) (assert((X))) 38 | #endif 39 | 40 | #define cdb_implies(P, Q) cdb_assert(!(P) || (Q)) 41 | 42 | #define CDB_BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) 43 | #define CDB_MIN(X, Y) ((X) < (Y) ? (X) : (Y)) 44 | #define CDB_NBUCKETS (8ul) 45 | #define CDB_BUCKETS (1ul << CDB_NBUCKETS) 46 | #define CDB_FILE_START (0ul) 47 | 48 | /* This enumeration is here and not in the header deliberately, it is to 49 | * stop error codes becoming part of the API for this library. 
*/ 50 | enum { 51 | CDB_OK_E = 0, /* no error */ 52 | CDB_NOT_FOUND_E = 0, /* key: not-found */ 53 | CDB_FOUND_E = 1, /* key: found */ 54 | CDB_ERROR_E = -1, /* generic error */ 55 | CDB_ERROR_HASH_E = -2, /* unexpected hash value given bucket */ 56 | CDB_ERROR_BOUND_E = -3, /* pointers out of bounds */ 57 | CDB_ERROR_OVERFLOW_E = -4, /* some calculation overflowed and should not have */ 58 | CDB_ERROR_OPEN_E = -5, /* open failed */ 59 | CDB_ERROR_SEEK_E = -6, /* seek failed */ 60 | CDB_ERROR_WRITE_E = -7, /* write failed to write any/enough bytes */ 61 | CDB_ERROR_READ_E = -8, /* read failed to read any/enough bytes */ 62 | CDB_ERROR_ALLOCATE_E = -9, /* reallocate/allocation failed */ 63 | CDB_ERROR_FREE_E = -10, /* free failed */ 64 | CDB_ERROR_MODE_E = -11, /* incorrect mode for operation */ 65 | CDB_ERROR_DISABLED_E = -12, /* unimplemented/disabled feature */ 66 | CDB_ERROR_SIZE_E = -13, /* invalid/unsupported size */ 67 | }; 68 | 69 | typedef struct { 70 | cdb_word_t position; /* position on disk of this hash table, when known */ 71 | cdb_word_t length; /* number of buckets in hash table */ 72 | } cdb_hash_header_t; /* initial hash table structure */ 73 | 74 | /* NB. More is allocated than needed for the memory index, it 75 | * would make things ugly to correct this however, so it will not be. */ 76 | typedef struct { 77 | cdb_word_t *hashes; /* full key hashes */ 78 | cdb_word_t *fps; /* file pointers */ 79 | cdb_hash_header_t header; /* header for this hash table */ 80 | } cdb_hash_table_t; /* secondary hash table structure */ 81 | 82 | struct cdb { /* constant database handle: for all your querying needs! 
*/ 83 | cdb_options_t ops; /* custom file/flash operators */ 84 | void *file; /* database handle */ 85 | cdb_word_t file_start, /* start position of structures in file */ 86 | file_end, /* end position of database in file, if known, zero otherwise */ 87 | hash_start; /* start of secondary hash tables near end of file, if known, zero otherwise */ 88 | cdb_word_t position; /* read/write/seek position: be careful with this variable! */ 89 | int error; /* error, if any, any error causes database to be invalid */ 90 | unsigned create : 1, /* have we opened database up in create mode? */ 91 | opened : 1, /* have we successfully opened up the database? */ 92 | empty : 1, /* is the database empty? */ 93 | sought : 1; /* have we performed at least one seek (needed to position init cache) */ 94 | cdb_hash_table_t table1[]; /* only allocated if in create mode, BUCKETS elements are allocated */ 95 | }; 96 | 97 | /* To make the library easier to use we could provide a set of default 98 | * allocators (that when compiled out always return an error), the non-thread 99 | * safe allocator would return a pointer to a statically declared variable and 100 | * mark it as being used. */ 101 | 102 | int cdb_version(unsigned long *version) { 103 | CDB_BUILD_BUG_ON(sizeof(cdb_word_t) != 2 && sizeof(cdb_word_t) != 4 && sizeof(cdb_word_t) != 8); 104 | cdb_assert(version); 105 | unsigned long spec = ((sizeof (cdb_word_t)) * CHAR_BIT) >> 4; /* Lowest three bits = size */ 106 | spec |= CDB_TESTS_ON << 4; 107 | spec |= CDB_WRITE_ON << 5; 108 | spec |= CDB_MEMORY_INDEX_ON << 6; 109 | /*spec |= 0 << 7; */ 110 | *version = (spec << 24) | CDB_VERSION; 111 | return CDB_VERSION == 0 ? 
CDB_ERROR_E : CDB_OK_E; 112 | } 113 | 114 | int cdb_status(cdb_t *cdb) { 115 | cdb_assert(cdb); 116 | return cdb->error; 117 | } 118 | 119 | static inline size_t cdb_get_size(cdb_t *cdb) { 120 | cdb_assert(cdb); 121 | return cdb->ops.size; 122 | } 123 | 124 | static inline uint64_t cdb_get_mask(cdb_t *cdb) { 125 | cdb_assert(cdb); 126 | const size_t l = cdb_get_size(cdb); 127 | if (l == 16/CHAR_BIT) 128 | return UINT16_MAX; 129 | if (l == 32/CHAR_BIT) 130 | return UINT32_MAX; 131 | cdb_assert(l == 64/CHAR_BIT); 132 | return UINT64_MAX; 133 | } 134 | 135 | /* This is not 'djb2' hash - the character is xor'ed in and not added. This 136 | * has sometimes been called 'DJB2a'. */ 137 | static inline uint32_t cdb_djb_hash(const uint8_t *s, const size_t length) { 138 | cdb_assert(s); 139 | uint32_t h = 5381ul; 140 | for (size_t i = 0; i < length; i++) 141 | h = ((h << 5ul) + h) ^ s[i]; /* (h * 33) xor c */ 142 | return h; 143 | } 144 | 145 | static int cdb_memory_compare(const void *a, const void *b, size_t length) { 146 | cdb_assert(a); 147 | cdb_assert(b); 148 | return memcmp(a, b, length); 149 | } 150 | 151 | typedef cdb_word_t (*cdb_hash_fn)(const uint8_t *s, const size_t length); 152 | 153 | cdb_word_t cdb_hash(const uint8_t *s, const size_t length) { 154 | cdb_assert(s); 155 | return cdb_djb_hash(s, length); 156 | } 157 | 158 | /* A 64-bit hash has to be used for the 64-bit database version otherwise if 159 | * we used a 32-bit hash all of our keys and values would be 160 | * stored...suboptimally. 
*/ 161 | static cdb_word_t cdb_hash64(const uint8_t *s, const size_t length) { 162 | cdb_assert(s); 163 | /* SDBM hash see: 164 | and */ 165 | assert(sizeof(cdb_word_t) >= sizeof(uint64_t)); 166 | uint64_t hash = 0xA5A5A5A5A5A5A5A5ull; 167 | for (size_t i = 0; i < length; i++) { 168 | const uint64_t ch = s[i]; 169 | hash = ch + (hash << 6) + (hash << 16) - hash; 170 | } 171 | return hash; 172 | } 173 | 174 | static void cdb_preconditions(cdb_t *cdb) { 175 | cdb_assert(cdb); 176 | cdb_implies(cdb->file_end != 0, cdb->file_end > cdb->file_start); 177 | cdb_implies(cdb->hash_start != 0, cdb->hash_start > cdb->file_start); 178 | cdb_assert(cdb->ops.allocator); 179 | cdb_assert(cdb->ops.read); 180 | cdb_assert(cdb->ops.open); 181 | cdb_assert(cdb->ops.close); 182 | cdb_assert(cdb->ops.seek); 183 | cdb_assert(cdb->error <= 0); 184 | cdb_implies(cdb->create, cdb->ops.write); 185 | /*cdb_assert(cdb->error == 0);*/ 186 | } 187 | 188 | static inline int cdb_failure(cdb_t *cdb) { 189 | cdb_preconditions(cdb); 190 | return cdb->error ? CDB_ERROR_E : CDB_OK_E; 191 | } 192 | 193 | static inline int cdb_error(cdb_t *cdb, const int error) { 194 | cdb_preconditions(cdb); 195 | if (cdb->error == 0) 196 | cdb->error = error; 197 | return cdb_failure(cdb); 198 | } 199 | 200 | static inline int cdb_bound_check(cdb_t *cdb, const int fail) { 201 | cdb_assert(cdb); 202 | return cdb_error(cdb, fail ? CDB_ERROR_BOUND_E : CDB_OK_E); 203 | } 204 | 205 | static inline int cdb_hash_check(cdb_t *cdb, const int fail) { 206 | cdb_assert(cdb); 207 | return cdb_error(cdb, fail ? CDB_ERROR_HASH_E : CDB_OK_E); 208 | } 209 | 210 | static inline int cdb_overflow_check(cdb_t *cdb, const int fail) { 211 | cdb_assert(cdb); 212 | return cdb_error(cdb, fail ? 
CDB_ERROR_OVERFLOW_E : CDB_OK_E); 213 | } 214 | 215 | static inline int cdb_free(cdb_t *cdb, void *p) { 216 | cdb_assert(cdb); 217 | if (!p) 218 | return 0; 219 | (void)cdb->ops.allocator(cdb->ops.arena, p, 0, 0); 220 | return 0; 221 | } 222 | 223 | static inline void *cdb_allocate(cdb_t *cdb, const size_t length) { 224 | cdb_preconditions(cdb); 225 | void *r = cdb->ops.allocator(cdb->ops.arena, NULL, 0, length); 226 | if (length != 0 && r == NULL) 227 | (void)cdb_error(cdb, CDB_ERROR_ALLOCATE_E); 228 | return r ? memset(r, 0, length) : NULL; 229 | } 230 | 231 | static inline void *cdb_reallocate(cdb_t *cdb, void *pointer, const size_t length) { 232 | cdb_preconditions(cdb); 233 | void *r = cdb->ops.allocator(cdb->ops.arena, pointer, 0, length); 234 | if (length != 0 && r == NULL) 235 | (void)cdb_error(cdb, CDB_ERROR_ALLOCATE_E); 236 | return r; 237 | } 238 | 239 | /* NB. A seek can cause buffers to be flushed, which degrades performance quite a lot */ 240 | static int cdb_seek_internal(cdb_t *cdb, const cdb_word_t position) { 241 | cdb_preconditions(cdb); 242 | if (cdb->error) 243 | return -1; 244 | if (cdb->opened && cdb->create == 0) 245 | if (cdb_bound_check(cdb, position < cdb->file_start || cdb->file_end < position)) 246 | return -1; 247 | if (cdb->sought == 1u && cdb->position == position) 248 | return cdb_error(cdb, CDB_OK_E); 249 | const int r = cdb->ops.seek(cdb->file, position + cdb->ops.offset); 250 | if (r >= 0) { 251 | cdb->position = position; 252 | cdb->sought = 1u; 253 | } 254 | return cdb_error(cdb, r < 0 ? CDB_ERROR_SEEK_E : CDB_OK_E); 255 | } 256 | 257 | int cdb_seek(cdb_t *cdb, const cdb_word_t position) { 258 | cdb_preconditions(cdb); 259 | if (cdb_error(cdb, cdb->create != 0 ? 
CDB_ERROR_MODE_E : 0)) 260 | return 0; 261 | return cdb_seek_internal(cdb, position); 262 | } 263 | 264 | static cdb_word_t cdb_read_internal(cdb_t *cdb, void *buf, cdb_word_t length) { 265 | cdb_preconditions(cdb); 266 | cdb_assert(buf); 267 | if (cdb_error(cdb, cdb->create != 0 ? CDB_ERROR_MODE_E : 0)) 268 | return 0; 269 | const cdb_word_t r = cdb->ops.read(cdb->file, buf, length); 270 | const cdb_word_t n = cdb->position + r; 271 | if (cdb_overflow_check(cdb, n < cdb->position) < 0) 272 | return 0; 273 | cdb->position = n; 274 | return r; 275 | } 276 | 277 | int cdb_read(cdb_t *cdb, void *buf, cdb_word_t length) { 278 | cdb_preconditions(cdb); 279 | const cdb_word_t r = cdb_read_internal(cdb, buf, length); 280 | return cdb_error(cdb, r != length ? CDB_ERROR_READ_E : 0); 281 | } 282 | 283 | static cdb_word_t cdb_write(cdb_t *cdb, void *buf, size_t length) { 284 | cdb_preconditions(cdb); 285 | cdb_assert(buf); 286 | if (cdb_error(cdb, cdb->create == 0 ? CDB_ERROR_MODE_E : 0)) 287 | return 0; 288 | const cdb_word_t r = cdb->ops.write(cdb->file, buf, length); 289 | const cdb_word_t n = cdb->position + r; 290 | if (cdb_overflow_check(cdb, n < cdb->position) < 0) 291 | return 0; 292 | if (r != length) 293 | return cdb_error(cdb, CDB_ERROR_WRITE_E); 294 | cdb->position = n; 295 | return r; 296 | } 297 | 298 | static inline void cdb_pack(uint8_t b[/*static (sizeof (cdb_word_t))*/], cdb_word_t w, size_t l) { 299 | cdb_assert(b); 300 | for (size_t i = 0; i < l; i++) 301 | b[i] = (w >> (i * CHAR_BIT)) & 0xFFu; 302 | } 303 | 304 | static inline cdb_word_t cdb_unpack(uint8_t b[/*static (sizeof (cdb_word_t))*/], size_t l) { 305 | cdb_assert(b); 306 | cdb_word_t w = 0; 307 | for (size_t i = 0; i < l; i++) 308 | w |= ((cdb_word_t)b[i]) << (i * CHAR_BIT); 309 | return w; 310 | } 311 | 312 | int cdb_read_word_pair(cdb_t *cdb, cdb_word_t *w1, cdb_word_t *w2) { 313 | cdb_assert(cdb); 314 | cdb_assert(w1); 315 | cdb_assert(w2); 316 | const size_t l = cdb_get_size(cdb); 317 | /* we 
only need to set this to 'b' to a value to avoid static checkers 318 | * signalling a problem, 'b' should be written to be 319 | * 'cdb_read_internal' before it is used. */ 320 | uint8_t b[2ul * sizeof(cdb_word_t)] = { 0, }; 321 | const cdb_word_t r = cdb_read_internal(cdb, b, 2ul * l); 322 | if (r != (cdb_word_t)(2l * l)) 323 | return -1; 324 | *w1 = cdb_unpack(b, l); 325 | *w2 = cdb_unpack(b + l, l); 326 | return 0; 327 | } 328 | 329 | static int cdb_write_word_pair(cdb_t *cdb, const cdb_word_t w1, const cdb_word_t w2) { 330 | cdb_assert(cdb); 331 | const size_t l = cdb_get_size(cdb); 332 | uint8_t b[2ul * sizeof(cdb_word_t)]; /* NOT INITIALIZED */ 333 | cdb_pack(b, w1, l); 334 | cdb_pack(b + l, w2, l); 335 | if (cdb_write(cdb, b, 2ul * l) != (2ul * l)) 336 | return -1; 337 | return 0; 338 | } 339 | 340 | static int cdb_hash_free(cdb_t *cdb, cdb_hash_table_t *t) { 341 | cdb_assert(cdb); 342 | cdb_assert(t); 343 | const int r1 = cdb_free(cdb, t->hashes); 344 | const int r2 = cdb_free(cdb, t->fps); 345 | t->hashes = NULL; 346 | t->fps = NULL; 347 | /* do not free t */ 348 | return r1 < 0 || r2 < 0 ? 
-1 : 0; 349 | } 350 | 351 | static int cdb_free_resources(cdb_t *cdb) { 352 | if (!cdb) 353 | return 0; 354 | if (cdb->file) 355 | cdb->ops.close(cdb->file); 356 | cdb->file = NULL; 357 | cdb->opened = 0; 358 | int r = 0; 359 | for (size_t i = 0; cdb->create && i < CDB_BUCKETS; i++) 360 | if (cdb_hash_free(cdb, &cdb->table1[i]) < 0) 361 | r = -1; 362 | (void)cdb_error(cdb, CDB_ERROR_E); 363 | (void)cdb->ops.allocator(cdb->ops.arena, cdb, 0, 0); 364 | return r; 365 | } 366 | 367 | static inline int cdb_finalize(cdb_t *cdb) { /* write hash tables to disk */ 368 | cdb_assert(cdb); 369 | cdb_assert(cdb->error == 0); 370 | cdb_assert(cdb->create == 1); 371 | if (CDB_WRITE_ON == 0) 372 | return cdb_error(cdb, CDB_ERROR_DISABLED_E); 373 | int r = 0; 374 | cdb_word_t mlen = 8; 375 | cdb_word_t *hashes = cdb_allocate(cdb, mlen * sizeof *hashes); 376 | cdb_word_t *positions = cdb_allocate(cdb, mlen * sizeof *positions); 377 | if (!hashes || !positions) 378 | goto fail; 379 | /* NB. No need to seek as we are the only thing that can affect 380 | * cdb->position in write mode */ 381 | cdb->hash_start = cdb->position; 382 | 383 | for (size_t i = 0; i < CDB_BUCKETS; i++) { /* write tables at end of file */ 384 | cdb_hash_table_t *t = &cdb->table1[i]; 385 | const cdb_word_t length = t->header.length * 2ul; 386 | t->header.position = cdb->position; /* needs to be set */ 387 | if (length == 0) 388 | continue; 389 | if (cdb_bound_check(cdb, length < t->header.length) < 0) 390 | goto fail; 391 | if (mlen < length) { 392 | const cdb_word_t required = length * sizeof (cdb_word_t); 393 | if (cdb_overflow_check(cdb, required < length) < 0) 394 | goto fail; 395 | cdb_word_t *t1 = cdb_reallocate(cdb, hashes, required); 396 | if (!t1) 397 | goto fail; 398 | hashes = t1; 399 | cdb_word_t *t2 = cdb_reallocate(cdb, positions, required); 400 | if (!t2) 401 | goto fail; 402 | positions = t2; 403 | mlen = length; 404 | } 405 | 406 | memset(hashes, 0, length * sizeof (cdb_word_t)); 407 | 
memset(positions, 0, length * sizeof (cdb_word_t)); 408 | 409 | for (size_t j = 0; j < t->header.length; j++) { 410 | const cdb_word_t h = t->hashes[j]; 411 | const cdb_word_t p = t->fps[j]; 412 | const cdb_word_t start = (h >> CDB_NBUCKETS) % length; 413 | cdb_word_t k = 0; 414 | for (k = start; positions[k]; k = (k + 1ul) % length) 415 | ; 416 | hashes[k] = h; 417 | positions[k] = p; 418 | } 419 | 420 | for (cdb_word_t j = 0; j < length; j++) 421 | if (cdb_write_word_pair(cdb, hashes[j], positions[j]) < 0) 422 | goto fail; 423 | } 424 | cdb->file_end = cdb->position; 425 | if (cdb_seek_internal(cdb, cdb->file_start) < 0) 426 | goto fail; 427 | for (size_t i = 0; i < CDB_BUCKETS; i++) { /* write initial hash table */ 428 | const cdb_hash_table_t * const t = &cdb->table1[i]; 429 | if (cdb_write_word_pair(cdb, t->header.position, (t->header.length * 2ul)) < 0) 430 | goto fail; 431 | } 432 | if (cdb_free(cdb, hashes) < 0) 433 | r = -1; 434 | if (cdb_free(cdb, positions) < 0) 435 | r = -1; 436 | return r == 0 && cdb->ops.flush ? cdb->ops.flush(cdb->file) : r; 437 | fail: 438 | (void)cdb_free(cdb, hashes); 439 | (void)cdb_free(cdb, positions); 440 | return cdb_error(cdb, CDB_ERROR_E); 441 | } 442 | 443 | int cdb_close(cdb_t *cdb) { /* free cdb, close (and write to disk if in create mode) */ 444 | if (!cdb) 445 | return 0; 446 | if (cdb->error) 447 | goto fail; 448 | if (cdb->create) 449 | if (cdb_finalize(cdb) < 0) 450 | goto fail; 451 | return cdb_free_resources(cdb); 452 | fail: 453 | (void)cdb_free_resources(cdb); 454 | return CDB_ERROR_E; 455 | } 456 | int cdb_open(cdb_t **cdb, const cdb_options_t *ops, const int create, const char *file) { 457 | /* We could allow the word size of the CDB database {16, 32 (default) or 64} 458 | * to be configured at run time and not compile time, this has API related 459 | * consequences, the size of 'cdb_word_t' would determine the maximum size that 460 | * could be supported by this library. 
'cdb_open' would have to take another 461 | * parameter or one of the structures passed in would need to be extended. On failure the partially built handle is released and '*cdb' is left NULL. */ 462 | cdb_assert(cdb); 463 | cdb_assert(ops); 464 | cdb_assert(ops->allocator); 465 | cdb_assert(ops->read); 466 | cdb_assert(ops->open); 467 | cdb_assert(ops->close); 468 | cdb_assert(ops->seek); 469 | cdb_implies(create, ops->write); 470 | CDB_BUILD_BUG_ON(CHAR_BIT != 8); 471 | /* ops->flush is optional */ 472 | *cdb = NULL; 473 | if (create && CDB_WRITE_ON == 0) 474 | return CDB_ERROR_E; 475 | if (ops->size != 0 && ops->size != 16 && ops->size != 32 && ops->size != 64) 476 | return CDB_ERROR_SIZE_E; 477 | if (ops->size != 0 && ops->size > (sizeof(cdb_word_t) * CHAR_BIT)) 478 | return CDB_ERROR_SIZE_E; 479 | cdb_t *c = NULL; 480 | const int large = CDB_MEMORY_INDEX_ON || create; 481 | const size_t csz = (sizeof *c) + (large * sizeof c->table1[0] * CDB_BUCKETS); 482 | c = ops->allocator(ops->arena, NULL, 0, csz); 483 | if (!c) 484 | goto fail; 485 | memset(c, 0, csz); 486 | c->ops = *ops; 487 | const cdb_hash_fn hash_fn = c->ops.size >= 64 ? cdb_hash64 : cdb_hash; 488 | c->ops.size = c->ops.size ? c->ops.size / CHAR_BIT : (32ul / CHAR_BIT); 489 | c->ops.hash = c->ops.hash ? c->ops.hash : hash_fn; 490 | c->ops.compare = c->ops.compare ? c->ops.compare : cdb_memory_compare; 491 | c->create = create; 492 | c->empty = 1; 493 | *cdb = c; 494 | c->file_start = CDB_FILE_START; 495 | c->file = c->ops.open(file, create ? 
CDB_RW_MODE : CDB_RO_MODE); 496 | if (!(c->file)) { 497 | (void)cdb_error(c, CDB_ERROR_OPEN_E); 498 | goto fail; 499 | } 500 | if (cdb_seek_internal(c, c->file_start) < 0) 501 | goto fail; 502 | if (create) { 503 | for (size_t i = 0; i < CDB_BUCKETS; i++) /* write empty header */ 504 | if (cdb_write_word_pair(c, 0, 0) < 0) 505 | goto fail; 506 | } else { 507 | /* We allocate more memory than we need if CDB_MEMORY_INDEX_ON is 508 | * true as 'cdb_hash_table_t' contains entries needed for 509 | * creation that we do not need when reading the database. */ 510 | cdb_word_t hpos = 0, hlen = 0, lpos = -1l, lset = 0, prev = 0, pnum = 0; 511 | for (size_t i = 0; i < CDB_BUCKETS; i++) { 512 | cdb_hash_table_t t = { .header = { .position = 0, .length = 0 } }; 513 | if (cdb_read_word_pair(c, &t.header.position, &t.header.length) < 0) 514 | goto fail; 515 | if (i && t.header.position != (prev + (pnum * (2ul * cdb_get_size(c))))) 516 | goto fail; 517 | prev = t.header.position; 518 | pnum = t.header.length; 519 | if (CDB_MEMORY_INDEX_ON) 520 | c->table1[i] = t; 521 | if (t.header.length) 522 | c->empty = 0; 523 | if (t.header.length && t.header.position < lpos) { 524 | lpos = t.header.position; 525 | lset = 1; 526 | } 527 | if (t.header.position > hpos) { 528 | hpos = t.header.position; 529 | hlen = t.header.length; 530 | } 531 | } 532 | if (cdb_seek_internal(c, c->file_start) < 0) 533 | goto fail; 534 | c->file_end = hpos + (hlen * (2ul * cdb_get_size(c))); 535 | c->hash_start = lset ? 
lpos : (CDB_BUCKETS * (2ul * cdb_get_size(c))); 536 | if (lset) { 537 | if (cdb_bound_check(c, c->file_start > lpos) < 0) 538 | goto fail; 539 | } 540 | if (cdb_overflow_check(c, c->file_end < hpos) < 0) 541 | goto fail; 542 | } 543 | c->opened = 1; 544 | return CDB_OK_E; 545 | fail: 546 | (void)cdb_close(c); /* frees 'c' (a NULL 'c' is a no-op) */ *cdb = NULL; /* never hand the caller a pointer to the freed handle */ 547 | return CDB_ERROR_E; 548 | } 549 | 550 | /* returns: -1 = error, 0 = not equal, 1 = equal */ 551 | static int cdb_compare(cdb_t *cdb, const cdb_buffer_t *k1, const cdb_file_pos_t *k2) { 552 | cdb_assert(cdb); 553 | cdb_assert(cdb->ops.compare); 554 | cdb_assert(k1); 555 | cdb_assert(k2); 556 | if (k1->length != k2->length) 557 | return CDB_NOT_FOUND_E; /* not equal */ 558 | const cdb_word_t length = k1->length; 559 | if (cdb_seek_internal(cdb, k2->position) < 0) 560 | return CDB_ERROR_E; 561 | for (cdb_word_t i = 0; i < length; i += CDB_READ_BUFFER_LENGTH) { 562 | /* Note that making this buffer larger may not make things faster - if 563 | * most keys differ in the first few bytes then a smaller buffer means 564 | * less bytes moved around before the comparison.
*/ 565 | uint8_t kbuf[CDB_READ_BUFFER_LENGTH]; 566 | CDB_BUILD_BUG_ON(sizeof kbuf != CDB_READ_BUFFER_LENGTH); 567 | const cdb_word_t rl = CDB_MIN((cdb_word_t)sizeof kbuf, (cdb_word_t)length - i); 568 | if (cdb_read_internal(cdb, kbuf, rl) != rl) 569 | return CDB_ERROR_E; 570 | if (cdb->ops.compare(k1->buffer + i, kbuf, rl)) 571 | return CDB_NOT_FOUND_E; 572 | } 573 | return CDB_FOUND_E; /* equal */ 574 | } 575 | 576 | static int cdb_retrieve(cdb_t *cdb, const cdb_buffer_t *key, cdb_file_pos_t *value, uint64_t *record) { 577 | cdb_assert(cdb); 578 | cdb_assert(cdb->opened); 579 | cdb_assert(cdb->ops.hash); 580 | cdb_assert(key); /* If key was NULL, we *could* lookup the values instead using cdb_foreach */ 581 | cdb_assert(value); 582 | cdb_assert(record); 583 | cdb_word_t pos = 0, num = 0, h = 0; 584 | uint64_t wanted = *record, recno = 0; 585 | *record = 0; 586 | *value = (cdb_file_pos_t) { 0, 0, }; 587 | if (cdb->error) 588 | goto fail; 589 | if (cdb->create) { 590 | (void)cdb_error(cdb, CDB_ERROR_MODE_E); 591 | goto fail; 592 | } 593 | /* It is usually a good idea to include the length as part of the data 594 | * of the hash, however that would make the format incompatible. */ 595 | h = cdb->ops.hash((uint8_t *)(key->buffer), key->length) & cdb_get_mask(cdb); /* locate key in first table */ 596 | if (CDB_MEMORY_INDEX_ON) { /* use more memory (~4KiB) to speed up first match */ 597 | cdb_hash_table_t *t = &cdb->table1[h % CDB_BUCKETS]; 598 | pos = t->header.position; 599 | num = t->header.length; 600 | } else { 601 | if (cdb_seek_internal(cdb, cdb->file_start + ((h % CDB_BUCKETS) * (2ul * cdb_get_size(cdb)))) < 0) 602 | goto fail; 603 | if (cdb_read_word_pair(cdb, &pos, &num) < 0) 604 | goto fail; 605 | } 606 | if (num == 0) /* no keys in this bucket -> key not found */ 607 | return cdb_failure(cdb) < 0 ? 
CDB_ERROR_E : CDB_NOT_FOUND_E; 608 | if (cdb_bound_check(cdb, pos > cdb->file_end || pos < cdb->hash_start) < 0) 609 | goto fail; 610 | const cdb_word_t start = (h >> CDB_NBUCKETS) % num; 611 | for (cdb_word_t i = 0; i < num; i++) { 612 | const cdb_word_t seekpos = pos + (((start + i) % num) * (2ul * cdb_get_size(cdb))); 613 | if (seekpos < pos || seekpos > cdb->file_end) 614 | goto fail; 615 | if (cdb_seek_internal(cdb, seekpos) < 0) 616 | goto fail; 617 | cdb_word_t h1 = 0, p1 = 0; 618 | if (cdb_read_word_pair(cdb, &h1, &p1) < 0) 619 | goto fail; 620 | if (cdb_bound_check(cdb, p1 > cdb->hash_start) < 0) /* key-value pair should not overlap with hash tables section */ 621 | goto fail; 622 | if (p1 == 0) { /* end of list */ 623 | *record = recno; 624 | return cdb_failure(cdb) < 0 ? CDB_ERROR_E : CDB_NOT_FOUND_E; 625 | } 626 | if (cdb_hash_check(cdb, (h1 & 0xFFul) != (h & 0xFFul)) < 0) /* buckets bits should be the same */ 627 | goto fail; 628 | if (h1 == h) { /* possible match */ 629 | if (cdb_seek_internal(cdb, p1) < 0) 630 | goto fail; 631 | cdb_word_t klen = 0, vlen = 0; 632 | if (cdb_read_word_pair(cdb, &klen, &vlen) < 0) 633 | goto fail; 634 | const cdb_file_pos_t k2 = { .length = klen, .position = p1 + (2ul * cdb_get_size(cdb)) }; 635 | if (cdb_overflow_check(cdb, k2.position < p1 || (k2.position + klen) < k2.position) < 0) 636 | goto fail; 637 | if (cdb_bound_check(cdb, k2.position + klen > cdb->hash_start) < 0) 638 | goto fail; 639 | const int comp = cdb_compare(cdb, key, &k2); 640 | const int found = comp > 0; 641 | if (comp < 0) 642 | goto fail; 643 | if (found && recno == wanted) { /* found key, correct record? 
*/ 644 | cdb_file_pos_t v2 = { .length = vlen, .position = k2.position + klen }; 645 | if (cdb_overflow_check(cdb, (v2.position + v2.length) < v2.position) < 0) 646 | goto fail; 647 | if (cdb_bound_check(cdb, v2.position > cdb->hash_start) < 0) 648 | goto fail; 649 | if (cdb_bound_check(cdb, (v2.position + v2.length) > cdb->hash_start) < 0) 650 | goto fail; 651 | *value = v2; 652 | *record = recno; 653 | return cdb_failure(cdb) < 0 ? CDB_ERROR_E : CDB_FOUND_E; 654 | } 655 | recno += found; 656 | } 657 | } 658 | *record = recno; 659 | return cdb_failure(cdb) < 0 ? CDB_ERROR_E : CDB_NOT_FOUND_E; 660 | fail: 661 | return cdb_error(cdb, CDB_ERROR_E); 662 | } 663 | 664 | int cdb_lookup(cdb_t *cdb, const cdb_buffer_t *key, cdb_file_pos_t *value, uint64_t record) { 665 | cdb_assert(cdb); 666 | cdb_assert(cdb->opened); 667 | cdb_assert(key); 668 | cdb_assert(value); 669 | return cdb_retrieve(cdb, key, value, &record); 670 | } 671 | 672 | int cdb_get(cdb_t *cdb, const cdb_buffer_t *key, cdb_file_pos_t *value) { 673 | cdb_assert(cdb); 674 | cdb_assert(cdb->opened); 675 | cdb_assert(key); 676 | cdb_assert(value); 677 | return cdb_lookup(cdb, key, value, 0l); 678 | } 679 | 680 | int cdb_count(cdb_t *cdb, const cdb_buffer_t *key, uint64_t *count) { 681 | cdb_assert(cdb); 682 | cdb_assert(cdb->opened); 683 | cdb_assert(key); 684 | cdb_assert(count); 685 | cdb_file_pos_t value = { 0, 0, }; 686 | uint64_t c = UINT64_MAX; 687 | const int r = cdb_retrieve(cdb, key, &value, &c); 688 | c = r == CDB_FOUND_E ? 
c + 1l : c; 689 | *count = c; 690 | return r; 691 | } 692 | 693 | int cdb_foreach(cdb_t *cdb, cdb_callback cb, void *param) { 694 | cdb_assert(cdb); 695 | cdb_assert(cdb->opened); 696 | if (cdb->error || cdb->create) 697 | goto fail; 698 | cdb_word_t pos = cdb->file_start + (256ul * (2ul * cdb_get_size(cdb))); 699 | int r = 0; 700 | for (;pos < cdb->hash_start;) { 701 | if (cdb_seek_internal(cdb, pos) < 0) 702 | goto fail; 703 | cdb_word_t klen = 0, vlen = 0; 704 | if (cdb_read_word_pair(cdb, &klen, &vlen) < 0) 705 | goto fail; 706 | const cdb_file_pos_t key = { .length = klen, .position = pos + (2ul * cdb_get_size(cdb)), }; 707 | const cdb_file_pos_t value = { .length = vlen, .position = pos + (2ul * cdb_get_size(cdb)) + klen, }; 708 | if (cdb_bound_check(cdb, value.position > cdb->hash_start) < 0) 709 | goto fail; 710 | if (cdb_bound_check(cdb, (value.position + value.length) > cdb->hash_start) < 0) 711 | goto fail; 712 | r = cb ? cb(cdb, &key, &value, param) : 0; 713 | if (r < 0) 714 | goto fail; 715 | if (r > 0) /* early termination */ 716 | break; 717 | pos = value.position + value.length; 718 | } 719 | return cdb_failure(cdb) < 0 ? 
CDB_ERROR_E : r; 720 | fail: 721 | return cdb_error(cdb, CDB_ERROR_E); 722 | } 723 | 724 | static int cdb_round_up_to_next_power_of_two(const cdb_word_t x) { 725 | cdb_word_t p = 1ul; 726 | while (p < x) 727 | p <<= 1ul; 728 | return p; 729 | } 730 | 731 | static int cdb_hash_grow(cdb_t *cdb, const cdb_word_t hash, const cdb_word_t position) { 732 | cdb_assert(cdb); 733 | cdb_hash_table_t *t1 = &cdb->table1[hash % CDB_BUCKETS]; 734 | cdb_word_t *hashes = t1->hashes, *fps = t1->fps; 735 | const cdb_word_t next = cdb_round_up_to_next_power_of_two(t1->header.length + 1ul); 736 | const cdb_word_t cur = cdb_round_up_to_next_power_of_two(t1->header.length); 737 | if (cdb_overflow_check(cdb, (t1->header.length + 1ul) < t1->header.length) < 0) 738 | return CDB_ERROR_E; 739 | if (next > cur || t1->header.length == 0) { 740 | const cdb_word_t alloc = t1->header.length == 0 ? 1ul : t1->header.length * 2ul; 741 | if (cdb_overflow_check(cdb, (t1->header.length * 2ul) < t1->header.length) < 0) 742 | return CDB_ERROR_E; 743 | if (!(hashes = cdb_reallocate(cdb, t1->hashes, alloc * sizeof (*t1->hashes)))) 744 | return CDB_ERROR_E; 745 | t1->hashes = hashes; 746 | if (!(fps = cdb_reallocate(cdb, t1->fps, alloc * sizeof (*t1->fps)))) { 747 | (void)cdb_hash_free(cdb, t1); 748 | return CDB_ERROR_E; 749 | } 750 | } 751 | t1->hashes = hashes; 752 | t1->fps = fps; 753 | t1->hashes[t1->header.length] = hash; 754 | t1->fps[t1->header.length] = position; 755 | t1->header.length++; 756 | return cdb_failure(cdb); 757 | } 758 | 759 | /* Duplicate keys can be added. To prevent this the library could easily be 760 | * improved in a backwards compatible way by extending the options structure 761 | * to include a new options value that would specify if adding duplicate keys 762 | * is allowed (adding values to the end of a structure being backwards 763 | * compatible in (most/all?) C ABIs). 
"cdb_add" would then need to be extended 764 | * to check for duplicate keys, which would be the difficult bit, a new lookup 765 | * function would need to be designed that could query the partially written 766 | * database. */ 767 | int cdb_add(cdb_t *cdb, const cdb_buffer_t *key, const cdb_buffer_t *value) { 768 | cdb_preconditions(cdb); 769 | cdb_assert(cdb->opened); 770 | cdb_assert(cdb->ops.hash); 771 | cdb_assert(key); 772 | cdb_assert(value); 773 | cdb_assert(cdb->position >= cdb->file_start); 774 | if (CDB_WRITE_ON == 0) 775 | return cdb_error(cdb, CDB_ERROR_DISABLED_E); 776 | if (cdb->error) 777 | goto fail; 778 | if (cdb->create == 0) { 779 | (void)cdb_error(cdb, CDB_ERROR_MODE_E); 780 | goto fail; 781 | } 782 | if (cdb_overflow_check(cdb, (key->length + value->length) < key->length) < 0) 783 | goto fail; 784 | const cdb_word_t h = cdb->ops.hash((uint8_t*)(key->buffer), key->length) & cdb_get_mask(cdb); 785 | if (cdb_hash_grow(cdb, h, cdb->position) < 0) 786 | goto fail; 787 | if (cdb_seek_internal(cdb, cdb->position) < 0) 788 | goto fail; 789 | if (cdb_write_word_pair(cdb, key->length, value->length) < 0) 790 | goto fail; 791 | if (cdb_write(cdb, key->buffer, key->length) != key->length) 792 | goto fail; 793 | if (cdb_write(cdb, value->buffer, value->length) != value->length) 794 | goto fail; 795 | cdb->empty = 0; 796 | return cdb_failure(cdb); 797 | fail: 798 | return cdb_error(cdb, CDB_ERROR_E); 799 | } 800 | 801 | uint64_t cdb_prng(uint64_t s[2]) { /* XORSHIFT128: A few rounds of SPECK or TEA ciphers also make good PRNGs */ 802 | cdb_assert(s); 803 | if (!s[0] && !s[1]) 804 | s[0] = 1; 805 | uint64_t a = s[0]; 806 | const uint64_t b = s[1]; 807 | s[0] = b; 808 | a ^= a << 23; 809 | a ^= a >> 18; 810 | a ^= b; 811 | a ^= b >> 5; 812 | s[1] = a; 813 | return a + b; 814 | } 815 | 816 | #define CDB_TEST_VECTOR_LEN (1024ul) 817 | 818 | /* A series of optional unit tests that can be compiled out 819 | * of the program, the function will still remain even if 
the 820 | * contents of it are elided. */ 821 | int cdb_tests(const cdb_options_t *ops, const char *test_file) { 822 | cdb_assert(ops); 823 | cdb_assert(test_file); 824 | CDB_BUILD_BUG_ON(sizeof (cdb_word_t) < 2); 825 | 826 | /* See readme.md for description of this and why this 827 | * is the way it is. Note that if "CDB_TESTS_ON" is 828 | * zero the rest of the code will be removed by the 829 | * compiler though. */ 830 | if (CDB_TESTS_ON == 0) 831 | return CDB_OK_E; 832 | 833 | const size_t l = ops->size; 834 | const size_t vectors = l == 16ul ? 128ul : CDB_TEST_VECTOR_LEN; 835 | const size_t klen = l == 16ul ? 64ul : CDB_TEST_VECTOR_LEN; 836 | const size_t vlen = l == 16ul ? 64ul : CDB_TEST_VECTOR_LEN; 837 | 838 | typedef struct { 839 | char key[CDB_TEST_VECTOR_LEN], value[CDB_TEST_VECTOR_LEN], result[CDB_TEST_VECTOR_LEN]; 840 | uint64_t recno; 841 | cdb_word_t klen, vlen; 842 | } test_t; 843 | 844 | typedef struct { char *key, *value; } test_duplicate_t; 845 | 846 | static const test_duplicate_t dups[] = { /* add known duplicates */ 847 | { "ALPHA", "BRAVO", }, 848 | { "ALPHA", "CHARLIE", }, 849 | { "ALPHA", "DELTA", }, 850 | { "FSF", "Collide-1", }, 851 | { "Aug", "Collide-2", }, 852 | { "FSF", "Collide-3", }, 853 | { "Aug", "Collide-4", }, 854 | { "revolves", "Collide-1", }, 855 | { "revolt's", "Collide-2", }, 856 | { "revolt's", "Collide-3", }, 857 | { "revolt's", "Collide-4", }, 858 | { "revolves", "Collide-5", }, 859 | { "revolves", "Collide-6", }, 860 | { "1234", "5678", }, 861 | { "1234", "9ABC", }, 862 | { "", "", }, 863 | { "", "X", }, 864 | { "", "", }, 865 | }; 866 | const size_t dupcnt = sizeof (dups) / sizeof (dups[0]); 867 | 868 | cdb_t *cdb = NULL; 869 | test_t *ts = NULL; 870 | uint64_t s[2] = { 0, }; 871 | int r = CDB_OK_E; 872 | 873 | if (cdb_open(&cdb, ops, 1, test_file) < 0) 874 | return CDB_ERROR_E; 875 | 876 | if (!(ts = cdb_allocate(cdb, (dupcnt + vectors) * (sizeof *ts)))) 877 | goto fail; 878 | 879 | for (unsigned i = 0; i < vectors; 
i++) { 880 | char *k = ts[i].key; 881 | char *v = ts[i].value; 882 | const cdb_word_t kl = (cdb_prng(s) % (klen - 1ul)) + 1ul; 883 | const cdb_word_t vl = (cdb_prng(s) % (vlen - 1ul)) + 1ul; 884 | for (unsigned long j = 0; j < kl; j++) 885 | k[j] = 'a' + (cdb_prng(s) % 26); /* this is biased, so what, fight me */ 886 | for (unsigned long j = 0; j < vl; j++) 887 | v[j] = 'a' + (cdb_prng(s) % 26); 888 | const cdb_buffer_t key = { .length = kl, .buffer = k }; 889 | const cdb_buffer_t value = { .length = vl, .buffer = v }; 890 | for (unsigned long j = 0; j < i; j++) 891 | if (memcmp(ts[i].value, ts[j].value, vlen) == 0) 892 | ts[i].recno++; 893 | if (cdb_add(cdb, &key, &value) < 0) 894 | goto fail; 895 | ts[i].klen = kl; 896 | ts[i].vlen = vl; 897 | } 898 | 899 | for (size_t i = 0; i < dupcnt; i++) { 900 | test_duplicate_t d = dups[i]; 901 | const cdb_buffer_t key = { .length = strlen(d.key), .buffer = d.key }; 902 | const cdb_buffer_t value = { .length = strlen(d.value), .buffer = d.value }; 903 | 904 | memcpy(ts[i + vectors].key, key.buffer, key.length); 905 | memcpy(ts[i + vectors].value, value.buffer, value.length); 906 | 907 | for (unsigned long j = 0; j < i; j++) 908 | if (memcmp(ts[i].value, ts[j].value, vlen) == 0) 909 | ts[i].recno++; 910 | 911 | if (cdb_add(cdb, &key, &value) < 0) 912 | goto fail; 913 | } 914 | 915 | 916 | if (cdb_close(cdb) < 0) { 917 | (void)ops->allocator(ops->arena, ts, 0, 0); 918 | return -1; 919 | } 920 | cdb = NULL; 921 | 922 | if (cdb_open(&cdb, ops, 0, test_file) < 0) { 923 | (void)ops->allocator(ops->arena, ts, 0, 0); 924 | return -1; 925 | } 926 | 927 | for (unsigned i = 0; i < (vectors + dupcnt); i++) { 928 | test_t *t = &ts[i]; 929 | const cdb_buffer_t key = { .length = t->klen, .buffer = t->key }; 930 | cdb_file_pos_t result = { 0, 0 }, discard = { 0, 0 }; 931 | const int g = cdb_lookup(cdb, &key, &result, t->recno); 932 | if (g < 0) 933 | goto fail; 934 | if (g == CDB_NOT_FOUND_E) { 935 | r = -3; /* -2 not used */ 936 | 
continue; 937 | } 938 | 939 | const int d = cdb_get(cdb, &key, &discard); 940 | if (d < 0) 941 | goto fail; 942 | if (d == CDB_NOT_FOUND_E) 943 | r = -4; 944 | 945 | if (result.length > vlen) 946 | goto fail; 947 | if (result.length != t->vlen) { 948 | r = -5; 949 | } else { 950 | if (cdb_seek_internal(cdb, result.position) < 0) 951 | goto fail; 952 | if (cdb_read_internal(cdb, t->result, result.length) != result.length) 953 | goto fail; 954 | if (memcmp(t->result, t->value, result.length)) 955 | r = -6; 956 | } 957 | 958 | uint64_t cnt = 0; 959 | if (cdb_count(cdb, &key, &cnt) < 0) 960 | goto fail; 961 | if (cnt < t->recno) 962 | r = -7; 963 | } 964 | 965 | if (cdb_free(cdb, ts) < 0) 966 | r = -1; 967 | if (cdb_close(cdb) < 0) 968 | r = -1; 969 | return r; 970 | fail: 971 | (void)ops->allocator(ops->arena, ts, 0, 0); 972 | (void)cdb_close(cdb); 973 | return CDB_ERROR_E; 974 | } 975 | 976 | -------------------------------------------------------------------------------- /cdb.h: -------------------------------------------------------------------------------- 1 | /* Consult the "readme.md" file in the repository for a detailed 2 | * description of the API and the internals. 
*/ 3 | #ifndef CDB_H 4 | #define CDB_H 5 | #ifdef __cplusplus 6 | extern "C" { 7 | #endif 8 | 9 | #define CDB_PROJECT "Constant Database" 10 | #define CDB_AUTHOR "Richard James Howe" 11 | #define CDB_EMAIL "howe.r.j.89@gmail.com" 12 | #define CDB_LICENSE "The Unlicense" 13 | #define CDB_REPO "https://github.com/howerj/cdb" 14 | 15 | #include 16 | #include 17 | 18 | #ifndef CDB_API 19 | #define CDB_API /* Used to apply attributes to exported functions */ 20 | #endif 21 | 22 | #ifndef CDB_WORD_T 23 | typedef uint64_t cdb_word_t; /* valid sizes: uint64_t, uint32_t, uint16_t */ 24 | #endif 25 | 26 | struct cdb; 27 | typedef struct cdb cdb_t; 28 | 29 | enum { CDB_RO_MODE, CDB_RW_MODE, }; /* passed to "open" in the "mode" option */ 30 | 31 | typedef struct { 32 | void *(*allocator)(void *arena, void *ptr, size_t oldsz, size_t newsz); 33 | cdb_word_t (*hash)(const uint8_t *data, size_t length); /* hash function: NULL defaults to djb hash */ 34 | int (*compare)(const void *a, const void *b, size_t length); /* key comparison function: NULL defaults to memcmp */ 35 | cdb_word_t (*read)(void *file, void *buf, size_t length); /* always needed, read from a resource */ 36 | cdb_word_t (*write)(void *file, void *buf, size_t length); /* (conditionally optional) needed for db creation only, write to a resource */ 37 | int (*seek)(void *file, uint64_t offset); /* "tell" is not needed as we keep track of the file position internally */ 38 | void *(*open)(const char *name, int mode); /* open up a resource, which may or may not be a file, for reading (mode = CDB_RO_MODE) or read/write (mode = CDB_RW_MODE) */ 39 | int (*close)(void *file); /* close a resource opened up with "open" */ 40 | int (*flush)(void *file); /* (optional) called at end of successful creation */ 41 | 42 | void *arena; /* used for 'arena' argument for the allocator, can be NULL if allocator allows it */ 43 | cdb_word_t offset; /* starting offset for CDB file if not at beginning of file */ 44 | unsigned size; /* 
Either 0 (defaults 32), 16, 32 or 64, but cannot be bigger than 'sizeof(cdb_word_t)*8' in any case */ 45 | } cdb_options_t; /* a file abstraction layer, could point to memory, flash, or disk */ 46 | 47 | typedef struct { 48 | cdb_word_t length; /* length of data */ 49 | char *buffer; /* pointer to arbitrary data */ 50 | } cdb_buffer_t; /* used to represent a key or value in memory */ 51 | 52 | typedef struct { 53 | cdb_word_t position; /* position in file, for use with cdb_read/cdb_seek */ 54 | cdb_word_t length; /* length of data on disk, for use with cdb_read */ 55 | } cdb_file_pos_t; /* used to represent a value on disk that can be accessed via 'cdb_options_t' */ 56 | 57 | typedef int (*cdb_callback)(cdb_t *cdb, const cdb_file_pos_t *key, const cdb_file_pos_t *value, void *param); 58 | 59 | /* All functions return: < 0 on failure, 0 on success/not found, 1 on found if applicable */ 60 | CDB_API int cdb_open(cdb_t **cdb, const cdb_options_t *ops, int create, const char *file); /* arena may be NULL, allocator must be present */ 61 | CDB_API int cdb_close(cdb_t *cdb); /* free cdb, close handles (and write to disk if in create mode) */ 62 | CDB_API int cdb_read(cdb_t *cdb, void *buf, cdb_word_t length); /* Returns error code not length! Not being able to read "length" bytes is an error! 
/* Host allocator callback, a single function that acts as malloc,
 * realloc and free depending on its arguments:
 *
 * - "newsz" of zero frees "ptr" and returns NULL.
 * - "newsz" greater than "oldsz" grows (or allocates) via realloc,
 *   returning NULL if that fails.
 * - otherwise "ptr" is returned unchanged, blocks are never shrunk.
 *
 * "arena" is not used by this allocator. */
static void *cdb_allocator_cb(void *arena, void *ptr, const size_t oldsz, const size_t newsz) {
	(void)arena;
	if (newsz != 0)
		return newsz <= oldsz ? ptr : realloc(ptr, newsz);
	free(ptr);
	return NULL;
}
assert(((file_t*)file)->handle); 31 | return fread(buf, 1, length, ((file_t*)file)->handle); 32 | } 33 | 34 | static cdb_word_t cdb_write_cb(void *file, void *buf, size_t length) { 35 | assert(file); 36 | assert(buf); 37 | assert(((file_t*)file)->handle); 38 | return fwrite(buf, 1, length, ((file_t*)file)->handle); 39 | } 40 | 41 | static int cdb_seek_cb(void *file, uint64_t offset) { 42 | assert(file); 43 | assert(((file_t*)file)->handle); 44 | return fseek(((file_t*)file)->handle, offset, SEEK_SET); 45 | } 46 | 47 | static void *cdb_open_cb(const char *name, int mode) { 48 | assert(name); 49 | assert(mode == CDB_RO_MODE || mode == CDB_RW_MODE); 50 | const char *mode_string = mode == CDB_RW_MODE ? "wb+" : "rb"; 51 | FILE *f = fopen(name, mode_string); 52 | if (!f) 53 | return f; 54 | const size_t length = 1024ul * 16ul; 55 | file_t *fb = malloc(sizeof (*f) + length); 56 | if (!fb) { 57 | fclose(f); 58 | return NULL; 59 | } 60 | fb->handle = f; 61 | fb->length = length; 62 | if (setvbuf(f, fb->buffer, _IOFBF, fb->length) < 0) { 63 | fclose(f); 64 | free(fb); 65 | return NULL; 66 | } 67 | return fb; 68 | } 69 | 70 | static int cdb_close_cb(void *file) { 71 | assert(file); 72 | assert(((file_t*)file)->handle); 73 | const int r = fclose(((file_t*)file)->handle); 74 | ((file_t*)file)->handle = NULL; 75 | free(file); 76 | return r; 77 | } 78 | 79 | static int cdb_flush_cb(void *file) { 80 | assert(file); 81 | return fflush(((file_t*)file)->handle); 82 | } 83 | 84 | const cdb_options_t cdb_host_options = { 85 | .allocator = cdb_allocator_cb, 86 | .hash = NULL, 87 | .compare = NULL, 88 | .read = cdb_read_cb, 89 | .write = cdb_write_cb, 90 | .seek = cdb_seek_cb, 91 | .open = cdb_open_cb, 92 | .close = cdb_close_cb, 93 | .flush = cdb_flush_cb, 94 | .arena = NULL, 95 | .offset = 0, 96 | .size = 0, /* auto-select */ 97 | }; 98 | 99 | -------------------------------------------------------------------------------- /host.h: 
-------------------------------------------------------------------------------- 1 | #ifndef CDB_HOST_H 2 | #define CDB_HOST_H 3 | 4 | #include "cdb.h" 5 | 6 | extern const cdb_options_t cdb_host_options; 7 | 8 | #endif 9 | -------------------------------------------------------------------------------- /main.c: -------------------------------------------------------------------------------- 1 | /* Program: Constant Database Driver 2 | * Author: Richard James Howe 3 | * Email: howe.r.j.89@gmail.com 4 | * License: Unlicense 5 | * Repo: */ 6 | 7 | #include "cdb.h" 8 | #include "host.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #define UNUSED(X) ((void)(X)) 19 | #define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) 20 | #define MAX(X, Y) ((X) > (Y) ? (X) : (Y)) 21 | #define IO_BUFFER_SIZE (1024u) 22 | #define DISTMAX (10ul) 23 | 24 | #ifdef _WIN32 /* Used to unfuck file mode for "Win"dows. Text mode is for losers. */ 25 | #include 26 | #include 27 | #include 28 | static void binary(FILE *f) { _setmode(_fileno(f), _O_BINARY); } /* only platform specific code... */ 29 | #else 30 | static inline void binary(FILE *f) { UNUSED(f); } 31 | #endif 32 | 33 | typedef struct { 34 | unsigned long records; 35 | unsigned long total_key_length, total_value_length; 36 | unsigned long min_key_length, min_value_length; 37 | unsigned long max_key_length, max_value_length; 38 | unsigned long hash_start; 39 | } cdb_statistics_t; 40 | 41 | typedef struct { 42 | char *arg; /* parsed argument */ 43 | int error, /* turn error reporting on/off */ 44 | index, /* index into argument list */ 45 | option, /* parsed option */ 46 | reset; /* set to reset */ 47 | char *place; /* internal use: scanner position */ 48 | int init; /* internal use: initialized or not */ 49 | } cdb_getopt_t; /* getopt clone; with a few modifications */ 50 | 51 | static unsigned verbose = 0; 52 | 53 | static void info(const char *fmt, ...) 
/* Print a printf style message followed by a newline to standard
 * error, flush it, and terminate the program with EXIT_FAILURE.
 * This function does not return. */
static void die(const char *fmt, ...) {
	assert(fmt);
	va_list args;
	va_start(args, fmt);
	(void)vfprintf(stderr, fmt, args);
	va_end(args);
	(void)fputc('\n', stderr);
	(void)fflush(stderr);
	exit(EXIT_FAILURE);
}
*/ 110 | if (opt->option == '-') 111 | return -1; 112 | if (!*opt->place) 113 | opt->index++; 114 | if (opt->error && *fmt != ':') 115 | if (fprintf(stderr, "illegal option -- %c\n", opt->option) < 0) 116 | return BADIO_E; 117 | return BADCH_E; 118 | } 119 | 120 | if (*++oli != ':') { /* don't need argument */ 121 | opt->arg = NULL; 122 | if (!*opt->place) 123 | opt->index++; 124 | } else { /* need an argument */ 125 | if (*opt->place) { /* no white space */ 126 | opt->arg = opt->place; 127 | } else if (argc <= ++opt->index) { /* no arg */ 128 | opt->place = ""; 129 | if (*fmt == ':') 130 | return BADARG_E; 131 | if (opt->error) 132 | if (fprintf(stderr, "option requires an argument -- %c\n", opt->option) < 0) 133 | return BADIO_E; 134 | return BADCH_E; 135 | } else { /* white space */ 136 | opt->arg = argv[opt->index]; 137 | } 138 | opt->place = ""; 139 | opt->index++; 140 | } 141 | return opt->option; /* dump back option letter */ 142 | } 143 | 144 | static int cdb_print(cdb_t *cdb, const cdb_file_pos_t *fp, FILE *output) { 145 | assert(cdb); 146 | assert(fp); 147 | assert(output); 148 | if (cdb_seek(cdb, fp->position) < 0) 149 | return -1; 150 | char buf[IO_BUFFER_SIZE]; 151 | const size_t length = fp->length; 152 | for (size_t i = 0; i < length; i += sizeof buf) { /* N.B. Double buffering! 
*/ 153 | const size_t l = length - i; 154 | if (l > sizeof buf) 155 | return -1; 156 | assert(l <= sizeof buf); 157 | if (cdb_read(cdb, buf, MIN(sizeof buf, l)) < 0) 158 | return -1; 159 | if (fwrite(buf, 1, l, output) != l) 160 | return -1; 161 | } 162 | return 0; 163 | } 164 | 165 | static inline void cdb_reverse_char_array(char * const r, const size_t length) { 166 | assert(r); 167 | const size_t last = length - 1; 168 | for (size_t i = 0; i < length / 2ul; i++) { 169 | const char t = r[i]; 170 | r[i] = r[last - i]; 171 | r[last - i] = t; 172 | } 173 | } 174 | 175 | static unsigned cdb_number_to_string(char b[65 /* max int size in base 2, + NUL*/], cdb_word_t u, int base) { 176 | assert(b); 177 | assert(base >= 2 && base <= 10); 178 | unsigned i = 0; 179 | do { 180 | const cdb_word_t radix = base; 181 | const cdb_word_t q = u % radix; 182 | const cdb_word_t r = u / radix; 183 | b[i++] = q + '0'; 184 | u = r; 185 | assert(i <= 64); 186 | } while (u); 187 | b[i] = '\0'; 188 | cdb_reverse_char_array(b, i); 189 | assert(b[i] == '\0'); 190 | return i; 191 | } 192 | 193 | static int cdb_dump(cdb_t *cdb, const cdb_file_pos_t *key, const cdb_file_pos_t *value, void *param) { 194 | assert(cdb); 195 | assert(key); 196 | assert(value); 197 | assert(param); 198 | FILE *output = param; 199 | char kstr[64+1], vstr[64+2]; /* NOT INITIALIZED */ 200 | kstr[0] = '+'; 201 | const unsigned kl = cdb_number_to_string(kstr + 1, key->length, 10) + 1; 202 | vstr[0] = ','; 203 | const unsigned nl = cdb_number_to_string(vstr + 1, value->length, 10) + 1; 204 | if (fwrite(kstr, 1, kl, output) != kl) 205 | return -1; 206 | vstr[nl] = ':'; 207 | vstr[nl + 1] = '\0'; 208 | if (fwrite(vstr, 1, nl + 1, output) != (nl + 1)) 209 | return -1; 210 | if (cdb_print(cdb, key, output) < 0) 211 | return -1; 212 | if (fwrite("->", 1, 2, output) != 2) 213 | return -1; 214 | if (cdb_print(cdb, value, output) < 0) 215 | return -1; 216 | return fputc('\n', output) != '\n' ? 
-1 : 0; 217 | } 218 | 219 | static int cdb_dump_keys(cdb_t *cdb, const cdb_file_pos_t *key, const cdb_file_pos_t *value, void *param) { 220 | assert(cdb); 221 | assert(key); 222 | assert(value); 223 | assert(param); 224 | UNUSED(value); 225 | FILE *output = param; 226 | char kstr[64+2]; /* NOT INITIALIZED */ 227 | kstr[0] = '+'; 228 | const unsigned kl = cdb_number_to_string(kstr + 1, key->length, 10) + 1; 229 | kstr[kl] = ':'; 230 | kstr[kl + 1] = '\0'; 231 | if (fwrite(kstr, 1, kl + 1, output) != (kl + 1)) 232 | return -1; 233 | if (cdb_print(cdb, key, output) < 0) 234 | return -1; 235 | return fputc('\n', output) != '\n' ? -1 : 0; 236 | } 237 | 238 | static int cdb_string_to_number(const char *s, cdb_word_t *out) { 239 | assert(s); 240 | cdb_word_t result = 0; 241 | int ch = s[0]; 242 | *out = 0; 243 | if (!ch) 244 | return -1; 245 | for (size_t j = 0; j < 64 && (ch = s[j]); j++) { 246 | const int digit = ch - '0'; 247 | if (digit < 0 || digit > 9) 248 | return -1; 249 | result = digit + (result * (cdb_word_t)10ul); 250 | } 251 | if (ch) 252 | return -1; 253 | *out = result; 254 | return 0; 255 | } 256 | 257 | static int scan(FILE *input, cdb_word_t *out, int delim) { 258 | assert(input); 259 | char b[64]; /* NOT INITIALIZED */ 260 | size_t i = 0; 261 | int ch = 0; 262 | for (i = 0; i < sizeof (b) && (EOF != (ch = fgetc(input))) && isdigit(ch); i++) 263 | b[i] = ch; 264 | if (i == sizeof(b)) 265 | return -1; 266 | b[i] = '\0'; 267 | if (delim == 0) { 268 | if (ungetc(ch, input) < 0) 269 | return -1; 270 | } else if (ch != delim) { 271 | return -1; 272 | } 273 | return cdb_string_to_number(b, out); 274 | } 275 | 276 | static int cdb_create(cdb_t *cdb, FILE *input) { 277 | assert(cdb); 278 | assert(input); 279 | 280 | int r = 0; 281 | size_t kmlen = IO_BUFFER_SIZE, vmlen = IO_BUFFER_SIZE; 282 | char *key = malloc(kmlen); 283 | char *value = malloc(vmlen); 284 | if (!key || !value) 285 | goto fail; 286 | 287 | for (;;) { 288 | cdb_word_t klen = 0, vlen = 0; 289 | 
char sep[2] = { 0, }; 290 | const int first = fgetc(input); 291 | if (first == EOF) /* || first == '\n' {need to handle '\r' as well} */ 292 | goto end; 293 | if (isspace(first)) 294 | continue; 295 | if (first != '+') 296 | goto fail; 297 | if (scan(input, &klen, ',') < 0) 298 | goto fail; 299 | if (scan(input, &vlen, ':') < 0) 300 | goto fail; 301 | if (kmlen < klen) { 302 | char *t = realloc(key, klen); 303 | if (!t) 304 | goto fail; 305 | kmlen = klen; 306 | key = t; 307 | } 308 | 309 | if (vmlen < vlen) { 310 | char *t = realloc(value, vlen); 311 | if (!t) 312 | goto fail; 313 | vmlen = vlen; 314 | value = t; 315 | } 316 | 317 | if (fread(key, 1, klen, input) != klen) 318 | goto fail; 319 | 320 | if (fread(sep, 1, sizeof sep, input) != sizeof sep) 321 | goto fail; 322 | 323 | if (sep[0] != '-' || sep[1] != '>') 324 | goto fail; 325 | 326 | if (fread(value, 1, vlen, input) != vlen) 327 | goto fail; 328 | 329 | const cdb_buffer_t kb = { .length = klen, .buffer = key }; 330 | const cdb_buffer_t vb = { .length = vlen, .buffer = value }; 331 | 332 | if (cdb_add(cdb, &kb, &vb) < 0) { 333 | (void)fprintf(stderr, "cdb file add failed\n"); 334 | goto fail; 335 | } 336 | const int ch1 = fgetc(input); 337 | if (ch1 == '\n') 338 | continue; 339 | if (ch1 == EOF) 340 | goto end; 341 | if (ch1 != '\r') 342 | goto fail; 343 | if ('\n' != fgetc(input)) 344 | goto fail; 345 | } 346 | fail: 347 | r = -1; 348 | end: 349 | free(key); 350 | free(value); 351 | return r; 352 | } 353 | 354 | static int cdb_stats(cdb_t *cdb, const cdb_file_pos_t *key, const cdb_file_pos_t *value, void *param) { 355 | assert(cdb); 356 | assert(key); 357 | assert(value); 358 | assert(param); 359 | UNUSED(cdb); 360 | cdb_statistics_t *cs = param; 361 | cs->records++; 362 | cs->total_key_length += key->length; 363 | cs->total_value_length += value->length; 364 | cs->min_key_length = MIN(cs->min_key_length, key->length); 365 | cs->min_value_length = MIN(cs->min_value_length, value->length); 366 | 
cs->max_key_length = MAX(cs->max_key_length, key->length); 367 | cs->max_value_length = MAX(cs->max_value_length, value->length); 368 | return 0; 369 | } 370 | 371 | static int cdb_stats_print(cdb_t *cdb, FILE *output, int verbose, size_t bytes) { 372 | assert(cdb); 373 | assert(output); 374 | unsigned long distances[DISTMAX] = { 0, }; 375 | unsigned long entries = 0, occupied = 0, collisions = 0, hmin = ULONG_MAX, hmax = 0; 376 | double avg_key_length = 0, avg_value_length = 0, avg_hash_length = 0; 377 | cdb_statistics_t s = { 378 | .records = 0, 379 | .min_key_length = ULONG_MAX, 380 | .min_value_length = ULONG_MAX, 381 | }; 382 | 383 | if (cdb_foreach(cdb, cdb_stats, &s) < 0) 384 | return -1; 385 | 386 | if (verbose) 387 | if (fputs("Initial hash table:\n", output) < 0) 388 | return -1; 389 | 390 | for (size_t i = 0; i < 256; i++) { 391 | if (cdb_seek(cdb, i * (2ull * bytes)) < 0) 392 | return -1; 393 | cdb_word_t pos = 0, num = 0; 394 | if (cdb_read_word_pair(cdb, &pos, &num) < 0) 395 | return -1; 396 | if (verbose) { 397 | if ((i % 4) == 0) 398 | if (fprintf(output, "\n%3d:\t", (int)i) < 0) 399 | return -1; 400 | if (fprintf(output, "$%4lx %3ld, ", (long)pos, (long)num) < 0) 401 | return -1; 402 | } 403 | 404 | collisions += num > 2ul; 405 | entries += num; 406 | occupied += num != 0; 407 | hmax = MAX(num, hmax); 408 | if (num) 409 | hmin = MIN(num, hmin); 410 | if (cdb_seek(cdb, pos) < 0) 411 | return -1; 412 | for (size_t j = 0; j < num; j++) { 413 | cdb_word_t h = 0, p = 0; 414 | if (cdb_read_word_pair(cdb, &h, &p) < 0) 415 | return -1; 416 | if (!p) 417 | continue; 418 | h = (h >> 8) % num; 419 | if (h == j) { 420 | h = 0; 421 | } else { 422 | h = h < j ? 
j - h : num - h + j; 423 | h = MIN(h, DISTMAX - 1ul); 424 | } 425 | distances[h]++; 426 | } 427 | } 428 | 429 | if (verbose) 430 | if (fputs("\n\n", output) < 0) 431 | return -1; 432 | 433 | if (s.records == 0) { 434 | s.min_key_length = 0; 435 | s.min_value_length = 0; 436 | hmin = 0; 437 | } else { 438 | avg_key_length = (double)s.total_key_length / (double) s.records; 439 | avg_value_length = (double)s.total_value_length / (double) s.records; 440 | avg_hash_length = (double)entries / (double)occupied; 441 | } 442 | 443 | if (fprintf(output, "records:\t\t\t%lu\n", s.records) < 0) 444 | return -1; 445 | if (fprintf(output, "key min/max/avg/bytes:\t%lu/%lu/%g/%lu\n", 446 | s.min_key_length, s.max_key_length, avg_key_length, s.total_key_length) < 0) 447 | return -1; 448 | if (fprintf(output, "value min/max/avg/bytes:\t%lu/%lu/%g/%lu\n", 449 | s.min_value_length, s.max_value_length, avg_value_length, s.total_value_length) < 0) 450 | return -1; 451 | if (fprintf(output, "top hash table used/entries/collisions:\t%lu/%lu/%lu\n", occupied, entries, collisions) < 0) 452 | return -1; 453 | if (fprintf(output, "hash tables min/avg/max:\t%lu/%g/%lu\n", hmin, avg_hash_length, hmax) < 0) 454 | return -1; 455 | if (fprintf(output, "hash tables collisions/buckets:\t%lu/%lu\n", s.records - distances[0], entries) < 0) 456 | return -1; 457 | if (fputs("hash table distances:\n", output) < 0) 458 | return -1; 459 | 460 | for (size_t i = 0; i < DISTMAX; i++) { 461 | const double pct = s.records ? ((double)distances[i] / (double)s.records) * 100.0 : 0.0; 462 | if (fprintf(output, "\td%u%s %4lu %5.2g%%\n", (unsigned)i, i == DISTMAX - 1ul ? 
/* Write an example database *dump* of "records" pseudo random records
 * to "output", in the "+klen,vlen:key->value" format used by create
 * mode, followed by a final blank line.
 *
 * Key and value lengths are drawn from the range "min" (inclusive) to
 * "max" (exclusive), or are exactly "min" when the two are equal. A
 * "max" of zero selects the default of 1024 and "min" is clamped to
 * "max". "seed" seeds the generator so output is reproducible.
 *
 * The previous range check was inverted, which rejected every non-zero
 * "min"; output for a "min" of zero is unchanged.
 *
 * Returns 0 on success and -1 on an output error. */
static int generate(FILE *output, unsigned long records, unsigned long min, unsigned long max, unsigned long seed) {
	assert(output);
	uint64_t s[2] = { seed, 0, };
	if (max == 0)
		max = 1024;
	if (min > max)
		min = max;
	const unsigned long span = max - min; /* cannot underflow: min <= max */
	for (uint64_t i = 0; i < records; i++) {
		const uint64_t kr = cdb_prng(s); /* adds bias but so what fight me */
		const uint64_t vr = cdb_prng(s);
		const unsigned long kl = min + (span ? (unsigned long)(kr % span) : 0ul);
		const unsigned long vl = min + (span ? (unsigned long)(vr % span) : 0ul);
		if (fprintf(output, "+%lu,%lu:", kl, vl) < 0)
			return -1;
		for (unsigned long j = 0; j < kl; j++)
			if (fputc('a' + (cdb_prng(s) % 26), output) < 0)
				return -1;
		if (fputs("->", output) < 0)
			return -1;
		for (unsigned long j = 0; j < vl; j++)
			if (fputc('a' + (cdb_prng(s) % 26), output) < 0)
				return -1;
		if (fputc('\n', output) < 0)
			return -1;
	}
	if (fputc('\n', output) < 0)
		return -1;
	return 0;
}
*/ 517 | for (; fgets(line, sizeof line, input); line[0] = 0) { 518 | size_t l = strlen(line); 519 | if (l && line[l-1] == '\n') 520 | line[l--] = 0; 521 | if (fprintf(output, "0x%08lx\n", (unsigned long)cdb_hash((uint8_t*)line, l)) < 0) 522 | return -1; 523 | } 524 | return 0; 525 | } 526 | 527 | static int help(FILE *output, const char *arg0) { 528 | assert(output); 529 | assert(arg0); 530 | unsigned long version = 0; 531 | if (cdb_version(&version) < 0) 532 | info("version not set - built incorrectly"); 533 | const unsigned q = (version >> 24) & 0xff; 534 | const unsigned x = (version >> 16) & 0xff; 535 | const unsigned y = (version >> 8) & 0xff; 536 | const unsigned z = (version >> 0) & 0xff; 537 | static const char *usage = "\ 538 | Usage : %s -hv *OR* -[rcdkstVT] file.cdb *OR* -q file.cdb key [record#] *OR* -g *OR* -H\n\ 539 | Program : Constant Database Driver (clone of https://cr.yp.to/cdb.html)\n\ 540 | Author : " CDB_AUTHOR "\n\ 541 | Email : " CDB_EMAIL "\n\ 542 | Repo : " CDB_REPO "\n\ 543 | License : " CDB_LICENSE "\n\ 544 | Version : %u.%u.%u\n\ 545 | Options : 0x%x\n\ 546 | Size : %d\n\ 547 | Notes : See manual pages or project website for more information.\n\n\ 548 | Options :\n\n\ 549 | \t-h : print this help message and exit successfully\n\ 550 | \t-v : increase verbosity level\n\ 551 | \t-c file.cdb : create a new database reading keys from stdin\n\ 552 | \t-d file.cdb : dump entire database\n\ 553 | \t-k file.cdb : dump all keys (there may be duplicates)\n\ 554 | \t-s file.cdb : calculate database statistics\n\ 555 | \t-t file.cdb : run internal tests generating a test file\n\ 556 | \t-T temp.cdb : name of temporary file to use\n\ 557 | \t-V file.cdb : validate database\n\ 558 | \t-q file.cdb key #? 
: run query for key with optional record number\n\ 559 | \t-b size : database size (valid sizes = 16, 32 (default), 64)\n\ 560 | \t-o number : specify offset into file where database begins\n\ 561 | \t-H : hash keys and output their hash\n\ 562 | \t-g : spit out an example database *dump* to standard out\n\ 563 | \t-m number : set minimum length of generated record\n\ 564 | \t-M number : set maximum length of generated record\n\ 565 | \t-R number : set number of generated records\n\ 566 | \t-S number : set seed for record generation\n\n\ 567 | In create mode the key input format is:\n\n\ 568 | \t+key-length,value-length:key->value\n\n\ 569 | An example:\n\n\ 570 | \t+5,5:hello->world\n\n\ 571 | Queries are in a similar format:\n\n\ 572 | \t+key-length:key\n\n\ 573 | Binary key/values are allowed, as are duplicate and empty keys/values.\n\ 574 | Returns values of 0 indicate success/found, 2 not found, and anything else\n\ 575 | indicates an error.\n\ 576 | "; 577 | return fprintf(output, usage, arg0, x, y, z, q,(int)(sizeof (cdb_word_t) * CHAR_BIT)); 578 | } 579 | 580 | int main(int argc, char **argv) { 581 | enum { QUERY, DUMP, CREATE, STATS, KEYS, VALIDATE, GENERATE, }; 582 | const char *file = NULL; 583 | char *tmp = NULL; 584 | int mode = VALIDATE, creating = 0; 585 | unsigned long min = 0ul, max = 1024ul, records = 1024ul, seed = 0ul; 586 | 587 | binary(stdin); 588 | binary(stdout); 589 | binary(stderr); 590 | 591 | char ibuf[BUFSIZ], obuf[BUFSIZ]; /* NOT INITIALIZED */ 592 | if (setvbuf(stdin, ibuf, _IOFBF, sizeof ibuf) < 0) 593 | return -1; 594 | if (setvbuf(stdout, obuf, _IOFBF, sizeof obuf) < 0) 595 | return -1; 596 | 597 | cdb_options_t ops = cdb_host_options; 598 | 599 | cdb_getopt_t opt = { .init = 0 }; 600 | for (int ch = 0; (ch = cdb_getopt(&opt, argc, argv, "hHgvt:c:d:k:s:q:V:b:T:m:M:R:S:o:G:")) != -1; ) { 601 | switch (ch) { 602 | case 'h': return help(stdout, argv[0]), 0; 603 | case 'H': return hasher(stdin, stdout); 604 | case 't': return 
-cdb_tests(&ops, opt.arg); 605 | case 'v': verbose++; break; 606 | case 'c': file = opt.arg; mode = CREATE; break; 607 | case 'd': file = opt.arg; mode = DUMP; break; 608 | case 'k': file = opt.arg; mode = KEYS; break; 609 | case 's': file = opt.arg; mode = STATS; break; 610 | case 'q': file = opt.arg; mode = QUERY; break; 611 | case 'V': file = opt.arg; mode = VALIDATE; break; 612 | case 'g': mode = GENERATE; break; 613 | case 'T': assert(opt.arg); tmp = opt.arg; break; 614 | case 'b': assert(opt.arg); ops.size = atol(opt.arg); break; 615 | case 'm': assert(opt.arg); min = atol(opt.arg); break; 616 | case 'M': assert(opt.arg); max = atol(opt.arg); break; 617 | case 'R': assert(opt.arg); records = atol(opt.arg); break; 618 | case 'S': assert(opt.arg); seed = atol(opt.arg); break; 619 | case 'o': assert(opt.arg); ops.offset = atol(opt.arg); break; 620 | default: help(stderr, argv[0]); return 1; 621 | } 622 | } 623 | 624 | /* N.B. We could also generate a CDB file directly as well, 625 | * instead of generating a dump, the "generate" function 626 | * would need a rewrite though */ 627 | if (mode == GENERATE) { 628 | int r = generate(stdout, records, min, max, seed); 629 | /* Valgrind reports errors (on my setup) when writing to 630 | * stdout and not flushing, the flush is called in the exit 631 | * code and causes an error even though nothing *seems* 632 | * incorrect. */ 633 | if (fflush(stdout) < 0) 634 | r = -1; 635 | return r < 0 ? 1 : 0; 636 | } 637 | 638 | /* For many of the modes "file" could be "stdout", this works 639 | * for everything bar CREATE mode which will need to seek on 640 | * its output. */ 641 | if (!file) 642 | return help(stderr, argv[0]), 1; 643 | 644 | creating = mode == CREATE; 645 | 646 | cdb_t *cdb = NULL; 647 | const char *name = creating && tmp ? tmp : file; 648 | info("opening '%s' for %s", name, creating ? 
"writing" : "reading"); 649 | const int etmp = errno; 650 | errno = 0; 651 | if (cdb_open(&cdb, &ops, creating, name) < 0) { 652 | const char *f = errno ? strerror(errno) : "unknown"; 653 | const char *m = creating ? "create" : "read"; 654 | die("opening file '%s' in %s mode failed: %s", name, m, f); 655 | } 656 | errno = etmp; 657 | 658 | int r = 0; 659 | switch (mode) { 660 | case CREATE: r = cdb_create(cdb, stdin); break; 661 | case DUMP: r = cdb_foreach(cdb, cdb_dump, stdout); if (fputc('\n', stdout) < 0) r = -1; break; 662 | case KEYS: r = cdb_foreach(cdb, cdb_dump_keys, stdout); if (fputc('\n', stdout) < 0) r = -1; break; 663 | case STATS: r = cdb_stats_print(cdb, stdout, 0, ops.size / 8ul); break; 664 | case VALIDATE: r = cdb_foreach(cdb, NULL, NULL); break; 665 | case QUERY: { 666 | if (opt.index >= argc) 667 | die("-q opt requires key (and optional record number)"); 668 | char *key = argv[opt.index++]; 669 | r = cdb_query(cdb, key, opt.index < argc ? atoi(argv[opt.index++]) : 0, stdout); 670 | break; 671 | } 672 | default: 673 | die("unimplemented mode: %d", mode); 674 | } 675 | if (fflush(stdout) < 0) 676 | r = -1; 677 | 678 | const int cdbe = cdb_status(cdb); 679 | if (cdb_close(cdb) < 0) 680 | die("close failed: %d", cdbe); 681 | if (cdbe < 0) 682 | die("cdb internal error: %d", cdbe); 683 | 684 | if (creating && tmp) { 685 | info("renaming temporary file"); 686 | if (rename(tmp, file) < 0) 687 | die("rename from '%s' to '%s' failed: %s", tmp, file, strerror(errno)); 688 | } 689 | return r < 0 ? 
1 : 0; 690 | } 691 | 692 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | # CDB makefile - default target should build everything 2 | # 3 | VERSION =0x080000ul 4 | CFLAGS =-Wall -Wextra -fPIC -std=c99 -O3 -pedantic -fwrapv -Wmissing-prototypes -DCDB_VERSION="${VERSION}" ${DEFINES} ${EXTRA} 5 | TARGET =cdb 6 | AR =ar 7 | ARFLAGS =rcs 8 | RANLIB =ranlib 9 | DESTDIR =install 10 | 11 | ifeq ($(OS),Windows_NT) 12 | DLL=dll 13 | else # Assume Unixen 14 | DLL=so 15 | CFLAGS+=-D_FILE_OFFSET_BITS=64 16 | endif 17 | 18 | .PHONY: all test clean dist install 19 | 20 | all: ${TARGET} 21 | 22 | cdb.o: cdb.c cdb.h makefile 23 | 24 | host.o: host.c host.h cdb.h makefile 25 | 26 | main.o: main.c host.o cdb.h makefile 27 | 28 | lib${TARGET}.a: ${TARGET}.o ${TARGET}.h 29 | ${AR} ${ARFLAGS} $@ $< 30 | ${RANLIB} $@ 31 | 32 | lib${TARGET}.${DLL}: ${TARGET}.o ${TARGET}.h 33 | ${CC} ${CFLAGS} -shared ${TARGET}.o -o $@ 34 | 35 | ${TARGET}: main.o host.o lib${TARGET}.a 36 | ${CC} $^ -o $@ 37 | -strip ${TARGET} 38 | 39 | test.cdb: ${TARGET} 40 | ./${TARGET} -t test.cdb 41 | 42 | test: test.cdb 43 | 44 | ${TARGET}.1: readme.md 45 | -pandoc -s -f markdown -t man $< -o $@ 46 | 47 | .git: 48 | git clone https://github.com/howerj/cdb cdb-repo 49 | mv cdb-repo/.git . 
50 | rm -rf cdb-repo 51 | 52 | install: ${TARGET} lib${TARGET}.a lib${TARGET}.${DLL} ${TARGET}.1 .git 53 | install -p -D ${TARGET} ${DESTDIR}/bin/${TARGET} 54 | install -p -m 644 -D lib${TARGET}.a ${DESTDIR}/lib/lib${TARGET}.a 55 | install -p -D lib${TARGET}.${DLL} ${DESTDIR}/lib/lib${TARGET}.${DLL} 56 | install -p -m 644 -D ${TARGET}.h ${DESTDIR}/include/${TARGET}.h 57 | -install -p -m 644 -D ${TARGET}.1 ${DESTDIR}/man/${TARGET}.1 58 | mkdir -p ${DESTDIR}/src 59 | cp -a .git ${DESTDIR}/src 60 | cd ${DESTDIR}/src && git reset --hard HEAD 61 | 62 | dist: install 63 | tar zcf ${TARGET}-${VERSION}.tgz ${DESTDIR} 64 | 65 | clean: .git 66 | git clean -dffx 67 | 68 | 69 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | % cdb(1) | Constant Database 2 | 3 | # NAME 4 | 5 | CDB - An interface to the Constant Database Library 6 | 7 | # SYNOPSES 8 | 9 | cdb -h 10 | 11 | cdb -\[cdkstVG\] file.cdb 12 | 13 | cdb -q file.cdb key \[record#\] 14 | 15 | cdb -g -m minimum -M maximum -R records -S seed 16 | 17 | cdb -H 18 | 19 | # DESCRIPTION 20 | 21 | Author: Richard James Howe 22 | License: Unlicense 23 | Repository: 24 | Email: howe.r.j.89@gmail.com 25 | 26 | A clone of the [CDB][] database, a simple, read-only (once created) database. 27 | The database library is designed so it can be embedded into a microcontroller 28 | if needed. This program can be used for creating and querying CDB databases, 29 | which consist of key-value pairs of binary data. 30 | 31 | This program also includes several options that help in testing out the 32 | database, one for hashing input keys and printing the hash for the default hash 33 | function and another one for generating a database with (Pseudo-)random keys 34 | and values of a given length. 
35 | 36 | **This library can create 16, 32 and 64 bit versions of the CDB file format 37 | removing one of the major limitations of the 32-bit version.** 38 | 39 | **The 64-bit version of the database uses a different hash than djb2**. 40 | 41 | # OPTIONS 42 | 43 | **-h** : print out this help message and exit successfully 44 | 45 | **-b** : set the size of the CDB database to use (default is 32, can be 16 or 64) 46 | 47 | **-v**: increase verbosity level 48 | 49 | **-t** *file.cdb* : run internal tests, exit with zero on a pass 50 | 51 | **-c** *file.cdb* : run in create mode 52 | 53 | **-d** *file.cdb* : dump the database 54 | 55 | **-k** *file.cdb* : dump the keys in the database 56 | 57 | **-s** *file.cdb* : print statistics about the database 58 | 59 | **-T** *temp.cdb* : name of temporary file to use 60 | 61 | **-V** *file.cdb* : validate database 62 | 63 | **-q** *file.cdb key record-number* : query the database for a key, with an optional record 64 | 65 | **-o** number : specify offset into file where database begins 66 | 67 | **-H** : hash keys and output their hash 68 | 69 | **-g** : spit out an example database to standard out 70 | 71 | **-m** number : set minimum length of generated record 72 | 73 | **-M** number : set maximum length of generated record 74 | 75 | **-R** number : set number of generated records 76 | 77 | **-S** number : set seed for record generation 78 | 79 | # EXAMPLES 80 | 81 | Creating a database, called 'example.cdb': 82 | 83 | $ ./cdb -c example.cdb 84 | +0,1:->X 85 | +1,0:Y-> 86 | +1,1:a->b 87 | +1,1:a->b 88 | +1,2:a->ba 89 | +5,5:hello->world 90 | 91 | Note that zero length keys and values are valid, and that duplicate keys are 92 | allowed, even keys with the same value. A key with the specified value is 93 | created for each duplicate, just like a non-duplicate key. 
94 | 95 | Looking up values in the created database: 96 | 97 | ./cdb -q example.cdb "" 98 | ./cdb -q example.cdb Y 99 | ./cdb -q example.cdb a 100 | ./cdb -q example.cdb a 0 101 | ./cdb -q example.cdb a 1 102 | ./cdb -q example.cdb a 2 103 | ./cdb -q example.cdb hello 104 | 105 | Dumping a database: 106 | 107 | $ ./cdb -d example.cdb 108 | 109 | A database dump can be read straight back in to create another database: 110 | 111 | $ ./cdb -d example.cdb | ./cdb -c should_have_just_used_copy.cdb 112 | 113 | Which is not useful in itself, but *assuming* your data (both keys and 114 | values) is ASCII text with no new lines and NUL characters then you could 115 | filter out, modify or add in values with the standard Unix command line 116 | tools. 117 | 118 | # RETURN VALUE 119 | 120 | cdb returns zero on success/key found, and a non zero value on failure. Two is 121 | returned if a key is not found, any other value indicates a more serious 122 | failure. 123 | 124 | # LIMITATIONS 125 | 126 | Three different versions of the library can be built; a 16, a 32 and a 64 bit 127 | version. The 32 bit version is the default version. For all versions there is a 128 | limit on the maximum file size in the format used of 2^N, where N is the size. 129 | Keys and Values have the same limit (although they can never reach that size as 130 | some of the overhead is taken up as part of the file format). Any other 131 | arbitrary limitation is a bug in the implementation. 132 | 133 | The minimum size of a CDB file is 256 \* 2 \* (N/8) bytes. 134 | 135 | It should be noted that if you build a N bit (where N is 16, 32 or 64) 136 | version of this library you are limited to creating databases that are the 137 | size of N and less, e.g. If `cdb_word_t` is set to `uint32_t`, and therefore 138 | the 32-bit version of this library is being built, then you can create 32-bit 139 | and 16-bit versions of the CDB database format, but you cannot make 64-bit 140 | versions. 
You can set `cdb_word_t` to `uint64_t` (which enables the library 141 | to create all three mutually incompatible versions of the library) on a 142 | 32-bit system, naturally. 143 | 144 | # INPUT/DUMP FORMAT 145 | 146 | The input and dump format follow the same pattern, some ASCII text specifying 147 | the beginning of a record and then some binary data with some separators, and 148 | a newline terminating the record, the format is: 149 | 150 | +key-length,value-length:KEY->VALUE 151 | +key-length,value-length:KEY->VALUE 152 | ... 153 | +key-length,value-length:KEY->VALUE 154 | 155 | Despite the presence of textual data, the input key and value can contain 156 | binary data, including the ASCII NUL character. 157 | 158 | An example, encoding the key value pair "abc" to "def" and "G" to "hello": 159 | 160 | +3,3:abc->def 161 | +1,5:G->hello 162 | 163 | The following [awk][] script can be used to pre-process a series of key-value 164 | pairs in the format "key value", with one record per line and optional comment 165 | lines: 166 | 167 | #!/bin/sh 168 | LC_ALL='C' awk ' 169 | /^[^#]/ { 170 | print "+" length($1) "," length($2) ":" $1 "->" $2 171 | } 172 | END { 173 | print "" 174 | } 175 | ' | cdb -c "$@" 176 | 177 | Which was available in the original [original cdb][] program as 'cdbmake-12'. 178 | 179 | # FILE FORMAT 180 | 181 | The file format is incredibly simple, it is designed so that only the header 182 | and the hash table pointer need to be stored in memory during generation of the 183 | table - the keys and values can be streamed on to the disk. The header consists 184 | of 256 2-word values forming an initial hash table that point to the hash 185 | tables at the end of the file, the key-value records, and then up to 256 hash 186 | tables pointing to the key-value pairs. 187 | 188 | A word consists of a 4-byte/32-bit value (although this may be changed via 189 | compile time options, creating an incompatible format). 
All word values are 190 | stored in little-endian format. 191 | 192 | The initial hash table contains an array of 256 2-word values. 193 | The words are; a position of a hash table in the file and the number of buckets 194 | in that hash table, stored in that order. To lookup a key the key is first 195 | hashed, the lowest eight bits of the hash are used to index into the initial table 196 | and if there are values in this hash the search then proceeds to the second hash 197 | table at the end of the file. 198 | 199 | The hash tables at the end of the file contains an array of two word records, 200 | containing the full hash and a file position of the key-value pair. To search 201 | for a key in this table the hash of the key is taken and the lowest eight bits 202 | are discarded by shifting right eight places, the hash is then taken modulo the 203 | number of elements in the hash table, the resulting value is used as an initial 204 | index into the hash table. Searching continues until the key is found, or an 205 | empty record is found, or the number of records in the table have been searched 206 | through with no match. A key is compared by looking at the hash table records, 207 | if the hash of the key matches the stored hash in the hash table records then a 208 | possible match is found, the file position is then used to look up the 209 | key-value pair and the key is compared. 210 | 211 | The number of buckets in the hash table is chosen as twice the number of 212 | populated entries in the hash table. 213 | 214 | A key-value pair is stored as two words containing the key length and the value 215 | length in that order, then the key, and finally the value. 216 | 217 | The hashing algorithm used is similar to [djb2][] (except for the 64-bit 218 | version, which uses a 64-bit variant of SDBM hash), but with a minor modification that 219 | an exclusive-or replaces an addition. 
220 | 221 | The algorithm calculates hashes of the size of a word, the initial hash value is the special 222 | number '5381'. The hash is calculated by multiplying the current hash value by 33; the 223 | new byte to be hashed and the result of the multiplication then undergo an exclusive-or 224 | operation. This repeats until all bytes to be hashed are processed. All 225 | arithmetic operations are unsigned and performed modulo 2 raised to the power 226 | of 32. 227 | 228 | The pseudo code for this is: 229 | 230 | set HASH to 5381 231 | for each OCTET in INPUT: 232 | set HASH to: ((HASH * 33) % pow(2, 32)) xor OCTET 233 | return HASH 234 | 235 | Note that there is nothing in the file format that disallows duplicate keys in 236 | the database, in fact the API allows duplicate keys to be retrieved. Both key 237 | and data values can also be zero bytes long. There are also no special 238 | alignment requirements on the data. 239 | 240 | The best documentation on the file format is a small pure python script that 241 | implements a set of functions for manipulating a CDB database, a description is 242 | available here and the 243 | script itself is available at the bottom of that page 244 | . 245 | 246 | A visualization of the overall file structure: 247 | 248 | Constant Database Sections 249 | .-------------------------------------------. 250 | | 256 Bucket Initial Hash Table (2KiB) | 251 | .-------------------------------------------. 252 | | Key Value Pairs | 253 | .-------------------------------------------. 254 | | 0-256 Secondary Hash Tables | 255 | .-------------------------------------------. 256 | 257 | The initial hash table at the start of the file: 258 | 259 | 256 Bucket Initial Hash Table (2KiB) 260 | .-------------------------------------------. 261 | | { P, L } | { P, L } | { P, L } | ... | 262 | .----------+----------+----------+----------. 263 | | ... | { P, L } | { P, L } | { P, L } | 264 | .-------------------------------------------. 
265 | P = Position of secondary hash table 266 | L = Number of buckets in secondary hash table 267 | 268 | The key-value pairs: 269 | 270 | .-------------------------------------------. 271 | | { KL, VL } | KEY ... | VALUE ... | 272 | .-------------------------------------------. 273 | KL = Key Length 274 | VL = Value Length 275 | KEY = Variable length binary data key 276 | VALUE = Variable length binary value 277 | 278 | Of the variable number of hash tables (which each are of a variable length) at 279 | the end of the file: 280 | 281 | 0-256 Variable Length Secondary Hash Tables 282 | .---------------------. 283 | | { H, P } | { H, P } | 284 | .----------+----------+---------------------. 285 | | { H, P } | ... | ... | { H, P } | 286 | .----------+----------+----------+----------. 287 | | { H, P } | ... | { H, P } | 288 | .--------------------------------. 289 | H = Hash 290 | P = Position of Key-Value Pair 291 | 292 | And that is all for the file format description. 293 | 294 | While the key-value pairs can be streamed to disk and the second level hash 295 | table written after those keys, anything that creates a database will have 296 | to seek to the beginning of the file to rewrite the header, this could have 297 | been avoided by storing the 256 initial hash table results at the end of 298 | the file allowing a database to be constructed in a Unix filter, but alas, 299 | this is not possible. Also of note, by passing in a custom hash algorithm to 300 | the C API you have much more control over where each of the key-value pairs 301 | get stored, specifically, which bucket they will end up in by controlling 302 | the lowest 8-bits (for example you could set the lowest 8-bits to the first 303 | byte in the key in a custom hash). 304 | 305 | Note that there is nothing stopping you storing the key-value pairs in 306 | some kind of order, you could do this by adding the keys in lexicographic 307 | order for a database sorted by key. 
Retrieving keys using the C function 308 | "cdb\_foreach" would allow you to retrieve keys in order. The hash table itself 309 | would remain unaware of this order. Dumping the key-value pairs would maintain 310 | this order as well. There is no guarantee other tools will preserve this 311 | order however (they may dump key-value pairs backwards, or by going through 312 | the hash table). 313 | 314 | # CDB C API OVERVIEW 315 | 316 | There are a few goals that the API has: 317 | 318 | * Simplicity, there should be few functions and data structures. 319 | * The API is easy to use. 320 | * There should be minimal dependencies on the C standard library. The 321 | library itself should be small and not be a huge, non-portable, "optimized", 322 | mess. 323 | * The user should decide when, where and how allocations are performed. The 324 | working set that is allocated should be small. 325 | * The database driver should catch corrupt files if possible. 326 | 327 | Some of these goals are in conflict, being able to control allocations and 328 | having minimal dependencies allow the library to be used in an embedded system, 329 | however it means that in order to do very basic things the user has to 330 | provide a series of callbacks. The callbacks are simple to implement on a 331 | hosted system, examples are provided in [main.c][] and [host.c][] in the 332 | project repository, but this means the library is not just ready to use. 333 | 334 | There are two sets of operations that most users will want to perform; creating 335 | a database and reading keys. 
After the callbacks have been provided, creating 336 | a database requires opening up a new database in create mode: 337 | 338 | /* error handling omitted for brevity */ 339 | cdb_t *cdb = NULL; 340 | cdb_options_t ops = { /* Your file callbacks/options go here */ }; 341 | cdb_open(&cdb, &ops, 1, "example.cdb"); 342 | cdb_buffer_t key = { .length = 5, .buffer = "hello", }; 343 | cdb_buffer_t value = { .length = 5, .buffer = "world", }; 344 | cdb_add(cdb, &key, &value); 345 | cdb_close(cdb); 346 | 347 | If you are dealing with mostly NUL terminated ASCII/UTF-8 strings it is worth 348 | creating a function to deal with them: 349 | 350 | int cdb_add_string(cdb_t *cdb, const char *key, const char *value) { 351 | assert(cdb); 352 | assert(key); 353 | assert(value); 354 | const cdb_buffer_t k = { .length = strlen(key), .buffer = (char*)key, }; 355 | const cdb_buffer_t v = { .length = strlen(value), .buffer = (char*)value, }; 356 | return cdb_add(cdb, &k, &v); 357 | } 358 | 359 | Note that you *cannot* query for a key from a database opened up in create 360 | mode and you *cannot* add a key-value pair to a database opened up in read 361 | mode. The operations are mutually exclusive. 362 | 363 | To search for a key within the database, you open up a database connection in 364 | read mode (create = 0): 365 | 366 | /* error handling omitted for brevity */ 367 | cdb_t *cdb = NULL; 368 | cdb_options_t ops = { /* Your file callbacks/options go here */ }; 369 | cdb_open(&cdb, &ops, 0, "example.cdb"); 370 | cdb_buffer_t key = { .length = 5, .buffer = "hello" }; 371 | cdb_file_pos_t value = { 0, 0, }; 372 | cdb_get(cdb, &key, &value); 373 | /* use cdb_seek, then cdb_read, to use returned value */ 374 | cdb_close(cdb); 375 | 376 | Upon retrieval of a key the database does not allocate a value for you, instead 377 | it provides an object consisting of a file position and a length of the value. 
378 | This can be read from wherever the database is stored with the function 379 | 'cdb\_read'. Before issuing a read, 'cdb\_seek' *must* be called as the file 380 | handle may be pointing to a different area in the database. 381 | 382 | If a read or a seek is issued that goes outside of the bounds of the database 383 | then all subsequent database operations on that handle will fail, not just 384 | reads or seeks. The only valid things to do on a database that has returned a 385 | negative number is to call 'cdb\_status' and then 'cdb\_close' and never 386 | use the handle again. 'cdb\_status' must not be used on a closed handle. 387 | 388 | As there are potentially duplicate keys, the function 'cdb\_count' can be 389 | used to query for duplicates. It sets the parameter count to the number of 390 | records found for that key (and it sets count to zero, and returns zero, if no 391 | keys are found, it returns one if one or more keys were found). 392 | 393 | The function 'cdb\_status' can be used to query what error has occurred, if 394 | any. On an error a negative value is returned, the meaning of this value is 395 | deliberately not included in the header as the errors recorded and the 396 | meaning of their values may change. Use the source for the library to determine 397 | what error occurred. 398 | 399 | The function 'cdb\_version' returns the version number in an out parameter 400 | and information about the compile time options selected when the library was built. 401 | A [Semantic Version Number][] is used, which takes the form "MAJOR.MINOR.PATCH". 402 | The PATCH number is stored in the Least Significant Byte, the MINOR number the 403 | next byte up, and the MAJOR in the third byte. The fourth byte contains the 404 | compile time options. 405 | 406 | There are several things that could be done to speed up the database but this 407 | would complicate the implementation and the API. 
408 | 409 | ## C API FUNCTIONS 410 | 411 | The C API contains 13 functions and some callbacks, more than is 412 | desired, but they all have their uses. Ideally a library would 413 | contain far fewer functions and require less of a cognitive burden 414 | on the user to get right, however making a generic enough C library 415 | and using C in general requires more complexity than is usual, but 416 | not more than is necessary. 417 | 418 | There is regularity in these functions, they all return negative 419 | on failure (the only exception being the allocator callback that 420 | returns a pointer), most of the functions accept a "cdb\_t" structure 421 | as well, which is an [opaque pointer][] (opaque pointers are not 422 | an unalloyed good, they imply that an allocator must be used, which 423 | can be a problem in embedded systems). 424 | 425 | int cdb_open(cdb_t **cdb, const cdb_options_t *ops, int create, const char *file); 426 | int cdb_close(cdb_t *cdb); 427 | int cdb_read(cdb_t *cdb, void *buf, cdb_word_t length); 428 | int cdb_add(cdb_t *cdb, const cdb_buffer_t *key, const cdb_buffer_t *value); 429 | int cdb_seek(cdb_t *cdb, cdb_word_t position); 430 | int cdb_foreach(cdb_t *cdb, cdb_callback cb, void *param); 431 | int cdb_read_word_pair(cdb_t *cdb, cdb_word_t *w1, cdb_word_t *w2); 432 | int cdb_get(cdb_t *cdb, const cdb_buffer_t *key, cdb_file_pos_t *value); 433 | int cdb_lookup(cdb_t *cdb, const cdb_buffer_t *key, cdb_file_pos_t *value, long record); 434 | int cdb_count(cdb_t *cdb, const cdb_buffer_t *key, long *count); 435 | int cdb_status(cdb_t *cdb); 436 | int cdb_version(unsigned long *version); 437 | int cdb_tests(const cdb_options_t *ops, const char *test_file); 438 | 439 | typedef int (*cdb_callback)(cdb_t *cdb, const cdb_file_pos_t *key, const cdb_file_pos_t *value, void *param); 440 | 441 | * cdb\_open 442 | 443 | The most complex function that contains the most parameters, "cdb\_open" 444 | is used to open a connection to a database. 
A pointer to a handle is 445 | passed to the first parameter, using the supplied allocation callback 446 | (passed-in in the "ops" parameter) the function will allocate enough space 447 | for a "cdb\_t" structure, this out-parameter is the database handle. It will 448 | be set to NULL on failure, which will also be indicated with a negative 449 | return value on the "cdb\_open" function. Once "cdb\_close" is called on 450 | this handle the handle *should not* be used again, and "cdb\_close" should 451 | only be called on the returned handle *once*. 452 | 453 | A single database can be opened by as many readers as you like, however 454 | reading a database and writing to a database are mutually exclusive operations. 455 | 456 | When writing to a database there *should not* be any readers active on 457 | that database. This is a fundamental limitation of the database design. 458 | 459 | Writing to a CDB file that is being read by another CDB instance can 460 | cause corruption of data and general nasty things! Do not do it! 461 | 462 | As such, a database can only be opened up in read only, or write only 463 | mode. 464 | 465 | The "file" parameter is passed to the "open" callback, which is present 466 | in the "ops" parameter. 467 | 468 | void *(*open)(const char *name, int mode); 469 | 470 | The callback should return an opaque pointer on success and NULL on failure. 471 | It is used to open up a handle to the database via whatever method the 472 | library user would like (for example, a simple file present in your file 473 | system, or a section of flash in an embedded computer). The open callback 474 | is used by "cdb\_open" and should not be called directly. 475 | 476 | The "mode" parameter to the "open" callback will be set to "CDB\_RW\_MODE" if 477 | "create" is non-zero, and will be set to "CDB\_RO\_MODE" if it is zero. 478 | 479 | CDB\_RW\_MODE is an enumeration that has the value "1", whilst 480 | CDB\_RO\_MODE has the value "0". 
481 | 482 | "cdb\_open" does quite a lot, when opening a CDB file for reading the 483 | file is *partially* verified, when opening for writing a blank first level 484 | hash table is written to disk. If either of these fails, then opening 485 | the database will fail. 486 | 487 | The function also needs the callbacks to perform a seek to be present, 488 | along with the callback for reading. The write callback only needs to 489 | be present when the database is opened up in write mode. 490 | 491 | * cdb\_close 492 | 493 | This closes the CDB database handle, the handle may be NULL, if so, 494 | nothing will be done. The same handle should not be passed in twice 495 | to "cdb\_close" as this can cause double-free errors. This function 496 | will release any memory and handles (by calling the "close" callback) 497 | associated with the handle. 498 | 499 | When writing a database this function has one more task to do, and 500 | that is finalizing the database, it writes out the hash-table at 501 | the end of the file. If "cdb\_close" is not called after the 502 | last entry has been added then the database will be in an invalid 503 | state and will not work. 504 | 505 | This function may return negative on error, for example if the 506 | finalization fails. 507 | 508 | After calling "cdb\_close" the handle *must not* be used again. 509 | 510 | * cdb\_read 511 | 512 | To be used on a database opened up in read-mode only. This can 513 | be used to read values, and sometimes keys, from the database. This 514 | function does not call "cdb\_seek", the caller must call "cdb\_seek" 515 | before calling this function to move the file pointer to the 516 | desired location before reading. The file pointer will be updated 517 | to point to after the location that has been read (or more accurately, 518 | the read callback must do this). 
This function *does not* return the 519 | number of bytes read, instead it returns zero for no error and 520 | negative if an error condition occurs (a partial read is treated as 521 | an error). 522 | 523 | * cdb\_add 524 | 525 | To be used on a database opened up in write, or creation, mode only. 526 | 527 | This function adds a key-value pair to the database, which can be 528 | looked up only after finalizing the database (by calling "cdb\_close") 529 | and reopening the database in read-only mode, which should be done 530 | after the final "cdb\_add" call has been made. 531 | 532 | It is unfortunate that both the key and value must reside within 533 | memory, but doing anything else would complicate the API too much. 534 | 535 | Once the key and value have been added they can be freed or discarded 536 | however. 537 | 538 | Adding key-value pairs consumes disk space and some extra memory 539 | which is needed to store the second level hash table, however the 540 | keys and values are not kept around in memory by the CDB library. 541 | 542 | Note that this function will add duplicate keys without complaining, 543 | and can add zero length keys and values, likewise without complaining. 544 | 545 | It is entirely up to the caller to prevent duplicates from being 546 | added. This is one improvement that could be added to the library (as 547 | you cannot check or query a partially written database at the 548 | moment). 549 | 550 | 551 | * cdb\_seek 552 | 553 | This function changes the position that the next read or write 554 | will occur from. You should not seek before or after the database, 555 | doing so will result in an error. Seeking is always relative to the 556 | start of the file, the optional offset specified in the CDB options 557 | structure being added to the current position. Relative to current 558 | position or file-end seeks cannot be done. 
559 | 560 | This function must be called before each call to "cdb\_read" or 561 | "cdb\_read\_word\_pair", otherwise you may read garbage. 562 | 563 | Calling "cdb\_seek" multiple times on the same location has no 564 | effect (the "fseek" C standard library function may discard buffers 565 | if called multiple times on the same location even though the file 566 | position has not changed). 567 | 568 | * cdb\_foreach 569 | 570 | The "cdb\_foreach" function calls a callback for each value within 571 | the CDB database. The callback is passed an optional "param". If 572 | the callback returns negative or a non-zero number then the for-each 573 | loop is terminated early (a positive number is returned, a negative 574 | number results in -1 being returned). If the callback returns zero 575 | then the next value, if any, is processed with the callback being 576 | called again. 577 | 578 | The callback is passed a structure which contains the location 579 | within the CDB database that contains the key and value. The keys 580 | and values are not presented in any specific order and the order 581 | should not be expected to stay the same between calls. 582 | 583 | To read either a key or a value you must call "cdb\_seek" before 584 | calling "cdb\_read" yourself. 585 | 586 | Passing in NULL is allowed and is not a No-Operation, it can be 587 | used to effectively check the integrity of the database. 588 | 589 | * cdb\_read\_word\_pair 590 | 591 | To be used on a database opened up in read-mode only. This function 592 | is a helper function that strictly does not need to exist, it is 593 | used for reading two "cdb\_word\_t" values from the database. This 594 | can be useful for the library user for more detailed analysis of 595 | the database than would normally be possible, many values within 596 | the database are stored as two "cdb\_word\_t" values. Looking inside this 597 | read-only database is not discouraged and the file format is well 598 | documented. 
599 | 600 | This function does not call "cdb\_seek", that must be called 601 | before hand to seek to the desired file location. The file position 602 | will be updated to point after the two read values. 603 | 604 | * cdb\_get 605 | 606 | This function populates the "value" structure if the "key" is found 607 | within the CDB database. The members of "value" will be set to zero 608 | if a key is not found, if it is found the position will be non-zero, 609 | although the length may be zero. 610 | 611 | Note that this function does not actually retrieve the key and put it 612 | into a buffer, there is a very good reason for that. It would be easy 613 | enough to make such a function given the functions present in this 614 | API, however in order to make such a function it would have to do 615 | the following; allocate enough space to store the value, read the 616 | value off of disk and then return the result. This has massive performance 617 | implications. Imagine if a large value is stored in the database, say 618 | a 1GiB value, this would mean at least 1GiB of memory would need to 619 | be allocated, it would also mean all of the file buffers would have 620 | been flushed and refilled, and all of that data would need to be copied 621 | from disk to memory. This might be desired, it might also be *very* 622 | wasteful, especially if only a fraction of the value is actually 623 | needed (say the first few hundred bytes). Whether this is wasteful 624 | depends entirely on your workload and use-cases for the database. 625 | 626 | It is better to give the user tools to do what they need than insisting 627 | it be done one, limiting, although "easy", way. 628 | 629 | This does mean that to actually retrieve the value the user must 630 | perform their own "cdb\_seek" and "cdb\_read" operations. 
This 631 | means that the entire value does not need to be read into memory 632 | by the consumer, and can potentially be processed block by block by 633 | the "read" callback if needed. 634 | 635 | * cdb\_lookup 636 | 637 | "cdb\_lookup" is similar to "cdb\_get" except it accepts an 638 | optional record number. Everything that applies to the get-function 639 | applies to the lookup-function, the only difference is the record 640 | number argument (internally "cdb\_get" is implemented with 641 | "cdb\_lookup"). 642 | 643 | If there are two or more keys that are identical then the question 644 | of how to select a specific key arises. This is done with an 645 | arbitrary number that will most likely, but is not guaranteed, to 646 | be the order in which the key was added into the database, with the 647 | first value being zero and the index being incremented from there 648 | on out. 649 | 650 | If the key is found but the index is out of bounds it is treated 651 | as if the key does not exist. Use "cdb\_count" to calculate the 652 | maximum number of records per key if needed, it is far more expensive 653 | to repeatedly call "cdb\_lookup" on a key until it returns "key 654 | not found" to determine the number of duplicate keys than it is 655 | to call "cdb\_count". 656 | 657 | The index argument perhaps should be a "cdb\_word\_t", but there 658 | is always debate around these topics (personally if I were to 659 | design a C-like programming language all integers would default 660 | to 64-bits and all pointers would fit within that, other types 661 | for indexing and the like would also be 64-bit, that's not a 662 | criticism of C, the madness around integer types was born out 663 | of necessity). 664 | 665 | * cdb\_count 666 | 667 | The "cdb\_count" function counts the number of entries that 668 | have the same key value. 
This function requires potentially multiple 669 | seeks and reads to compute, so the returned value should be cached if 670 | you plan on using it again as the value is expensive to calculate. 671 | 672 | If the key is not found, a value indicating that will be returned 673 | and the count argument will be zeroed. If found, the count will 674 | be put in the count argument. 675 | 676 | * cdb\_status 677 | 678 | This function returns the status of the CDB library handle. All 679 | errors are sticky in this library, if an error occurs when handling 680 | a CDB database then there is no way to clear that error short of 681 | reopening the database with a new handle. The only valid operation 682 | to do after getting an error from any of the functions that operate 683 | on a "cdb\_t" handle is to call "cdb\_status" to query the error 684 | value that is stored internally. 685 | 686 | "cdb\_status" should return a zero on no error and a negative value 687 | on failure. It should not return a positive non-zero value. 688 | 689 | * cdb\_version 690 | 691 | "cdb\_version" returns the version number of the library. It stores 692 | the value in an unsigned long. This may return an error value and a 693 | zero value if the version has not been set correctly at compile time. 694 | 695 | The value is stored in "MAJOR.MINOR.PATCH" format, with "PATCH" stored 696 | in the Least Significant Byte. This is a semantic version number. If 697 | the "MAJOR" number has changed then there are potentially breaking 698 | changes in the API or ABI of this library that have been introduced, 699 | no matter how trivial. 700 | 701 | * cdb\_tests 702 | 703 | And the callback for "cdb\_foreach": 704 | 705 | * "cdb\_callback" 706 | 707 | This callback is called for each value within the CDB database 708 | when used with "cdb\_foreach". If a negative value is returned from 709 | this callback then the foreach loop will end early and an error value 710 | will be returned. 
If the value returned is greater than zero then 711 | the foreach loop will potentially terminate early. If zero the 712 | foreach loop will continue to the next key-value pair if available. 713 | 714 | Each time this callback is called by "cdb\_foreach" it will be 715 | passed in a key-value pair in the form of two length/file-location 716 | structures. You will need to seek to those locations and 717 | read the key-values yourself. There is no guarantee the file position 718 | is in the correct location (ie. Pointing to the location of the 719 | key), so call "cdb\_seek" before calling "cdb\_read". 720 | 721 | There is no guarantee that the key-value pairs will be presented 722 | in the same order each time the function is called and should not 723 | be counted on. There is no attempt to preserve order. 724 | 725 | See "cdb\_foreach" for more information. 726 | 727 | ## C API STRUCTURES 728 | 729 | The C API has two simple structures and one complex one, the latter being 730 | more of a container for callbacks (or, some might say, a way of doing 731 | object oriented programming in C). The complex structure, "cdb\_options\_t", 732 | is an unfortunate necessity. 733 | 734 | The other two structures, "cdb\_buffer\_t" and "cdb\_file\_pos\_t", are 735 | simple enough and need very little explanation, although they will be explained. 
736 | 737 | Let us look at the "cdb\_options\_t" structure: 738 | 739 | typedef struct { 740 | void *(*allocator)(void *arena, void *ptr, size_t oldsz, size_t newsz); 741 | cdb_word_t (*hash)(const uint8_t *data, size_t length); 742 | int (*compare)(const void *a, const void *b, size_t length); 743 | cdb_word_t (*read)(void *file, void *buf, size_t length); 744 | cdb_word_t (*write)(void *file, void *buf, size_t length); 745 | int (*seek)(void *file, uint64_t offset); 746 | void *(*open)(const char *name, int mode); 747 | int (*close)(void *file); 748 | int (*flush)(void *file); 749 | 750 | void *arena; 751 | cdb_word_t offset; 752 | unsigned size; 753 | } cdb_options_t; 754 | 755 | Each member of the structure will need an explanation. 756 | 757 | ## STRUCTURE CALLBACKS 758 | 759 | * allocator 760 | 761 | This function is based off of the allocator callback mechanism 762 | present in Lua, see the "lua\_Alloc" entry in the Lua reference manual 763 | for more information on that allocator. This function can handle 764 | freeing memory, allocating memory, and reallocating memory, all 765 | in one function. This allows the user of this library to specify 766 | where objects are allocated and how. 767 | 768 | The arguments to the callback mean: 769 | 770 | 1. arena 771 | 772 | This may be NULL, it is an optional argument that can be used 773 | to store memory allocation statistics or as part of an arena 774 | allocator. 775 | 776 | 2. ptr 777 | 778 | This should be NULL if allocating new memory, or be a pointer 779 | to some previously allocated memory if freeing memory or 780 | reallocating it. 781 | 782 | 3. oldsz 783 | 784 | The old size of the pointer if known, if unknown, use zero. This is 785 | used to prevent unnecessary allocations. 786 | 787 | 4. newsz 788 | 789 | The new size of the desired pointer, this should be non-zero 790 | if reallocating or allocating memory. To free memory set this 791 | to zero, along with providing a pointer to free. 
If this is zero 792 | and the "ptr" is NULL then nothing will happen. 793 | 794 | 5. The return value 795 | 796 | This will be NULL on failure if allocating memory or reallocating 797 | memory and that operation failed. It will be non-NULL on success, 798 | containing usable memory. If freeing memory this should return NULL. 799 | 800 | An example allocator using the built in allocation routines is: 801 | 802 | void *allocator_cb(void *arena, void *ptr, size_t oldsz, size_t newsz) { 803 | UNUSED(arena); 804 | if (newsz == 0) { 805 | free(ptr); 806 | return NULL; 807 | } 808 | if (newsz > oldsz) 809 | return realloc(ptr, newsz); 810 | return ptr; 811 | } 812 | 813 | This callback is both simple and flexible, and more importantly 814 | puts the control of allocating back to the user (I know I have 815 | repeated this *many* times throughout this document, but it is 816 | worth repeating!). 817 | 818 | compare: /* key comparison function: NULL defaults to memcmp */ 819 | write: https://roboquill.io/ 820 | flush: /* (optional) called at end of successful creation */ 821 | 822 | arena: /* used for 'arena' argument for the allocator, can be NULL if allocator allows it */ 823 | offset: /* starting offset for CDB file if not at beginning of file */ 824 | size: /* Either 0 (same as 32), 16, 32 or 64, but cannot be bigger than 'sizeof(cdb_word_t)*8' */ 825 | 826 | * hash (optional) 827 | 828 | The "hash" callback can be set to NULL, if that is the case then 829 | the default hash, based off of djb2 and present in the original 830 | CDB library, will be used. If you do provide your own hash function 831 | you will effectively make this database incompatible with the standard 832 | CDB format but there are valid reasons for you do do this, you might 833 | need a stronger hash that is more resistant to denial of service attacks, 834 | or perhaps you want similar keys to *collide* more to group them together. 
835 | 836 | The hash function returns "cdb\_word\_t" so the number of bits this 837 | function returns is dependent on how big that type is (determined at 838 | compile time). 839 | 840 | * compare (optional) 841 | 842 | This function compares keys for a match, the function should behave like 843 | [memcmp][], returning the same values on a match and a failure. You 844 | may want to change this function if you want to compare keys partially, 845 | however you will also need to change the hash function to ensure keys are 846 | sorted into the right 256 buckets for your comparison (for example, with 847 | the default hash function two keys with the same prefix could be stored in 848 | two separate buckets). 849 | 850 | ### FILE CALLBACKS 851 | 852 | The following callbacks act in a similar way to the file functions present 853 | in [stdio.h][]. The only function missing is an [ftell][] equivalent. 854 | 855 | * read 856 | 857 | This function is used to read data out of the database, wherever that 858 | data is stored. Unlike [fread][] a status code is returned instead of 859 | the length of the data read, negative indicating failure. A partial read 860 | should result in a failure. The only thing lacking from this callback 861 | is a way to signal to perform non-blocking Input and Output, that would 862 | complicate the internals however. The "read" callback should always be 863 | present. 864 | 865 | The first parameter, "file", is a handle to an object returned by the 866 | "open" callback. 867 | 868 | The callback should return 0 indicating no error if "length" bytes have 869 | been read into "buf". 870 | 871 | Reading should continue from the previous file pointer position, that 872 | is if you open a file handle, read X bytes, the next time you read Y 873 | bytes they should be read from the end of the X bytes and not the 874 | beginning of the file (hence why read does not take a file position). 
875 | 876 | If implementing read callbacks in an embedded system you might have to 877 | also implement that behavior. 878 | 879 | * write (conditionally optional, needed for database creation only) 880 | 881 | Similar to the "read" callback, but instead writes data into wherever 882 | the database is stored. 883 | 884 | * seek 885 | 886 | This callback sets the file position that subsequent reads and writes 887 | occur from. 888 | 889 | * open 890 | 891 | This callback should open the resource specified by the "name" string 892 | (which will usually be a file name). There are two modes a read/write 893 | mode (used to create the database) and a read-only mode. This callback 894 | much like the "close" callback will only be called once internally 895 | by the CDB library. 896 | 897 | * close 898 | 899 | This callback should close the file handle returned by "open", freeing 900 | any resources associated with that handle. 901 | 902 | * flush (optional) 903 | 904 | An optional callback used for flushing writes to mass-storage. If NULL 905 | then the function will not be called. 906 | 907 | ## STRUCTURE VARIABLES 908 | 909 | * arena (optional, can be NULL, depends on your allocator) 910 | 911 | This value is passed into the allocator as the "arena" argument whenever 912 | the allocator is called. It can be NULL, which will usually be the case 913 | if you are just using "malloc", "realloc" and "free" to implement the 914 | allocator, but if you are implementing your own arena based allocator you 915 | might want to set it to point to your arena (hence the name). 916 | 917 | * offset 918 | 919 | This offset can be used for CDB databases embedded within a file. If 920 | the CDB database does not begin at the start of the file (or flash, or 921 | wherever) then you can set this offset to skip over that many number 922 | of bytes in the file. 
923 | 924 | * size 925 | 926 | The size variable, which can be left at zero, is used to select 927 | the word size of the database, this has an interaction with "cdb\_word\_t". 928 | 929 | Missing perhaps is an unsigned field that could contain options 930 | in each bit position in that field. 931 | 932 | 933 | ## BUFFER STRUCTURE 934 | 935 | typedef struct { 936 | cdb_word_t length; /* length of data */ 937 | char *buffer; /* pointer to arbitrary data */ 938 | } cdb_buffer_t; /* used to represent a key or value in memory */ 939 | 940 | ## FILE POSITION STRUCTURE 941 | 942 | typedef struct { 943 | cdb_word_t position; /* position in file, for use with cdb_read/cdb_seek */ 944 | cdb_word_t length; /* length of data on disk, for use with cdb_read */ 945 | } cdb_file_pos_t; /* used to represent a value on disk that can be accessed via 'cdb_options_t' */ 946 | 947 | ## EMBEDDED SUITABILITY 948 | 949 | There are many libraries written in C, for better or worse, as it is the 950 | lingua franca for software development at the moment. Few of those libraries 951 | are directly suitable for use in [Embedded systems][] and are much less 952 | flexible than they could be in general. Embedded systems pose some interesting 953 | constraints (eschewing allocation via "malloc", lack of a file-system, and 954 | more). By designing the library for an embedded system we can make a library 955 | more useful not only for those systems but for hosted systems as well (eg. By 956 | providing callbacks for the FILE functions we can redirect them to wherever 957 | we like, the CDB file could be stored remotely and accessed via TCP, or it 958 | could be stored locally using a normal file, or it could be stored in memory). 959 | 960 | There are two sets of functions that should be abstracted out in nearly 961 | every library, memory allocation (or even better, the caller can pass in 962 | fixed length structures if possible) and Input/Output functions (including 963 | logging!). 
This library does both. 964 | 965 | There is one area in which the library is lacking, the I/O functions do not 966 | yield if there is nothing to read yet, or a write operation is taking too 967 | long. This does impose constraints on the caller and how the library is used 968 | (all calls to the library could block for an arbitrary length of time). The 969 | callbacks could return a status indicating the caller should yield, but 970 | yielding and restoring state to enable partially completed I/O to finish 971 | would greatly complicate the library (this would be trivial to implement if 972 | C had portable coroutines built into the language). 973 | 974 | More libraries should be written with this information in mind. 975 | 976 | ## TEST SUITE 977 | 978 | There is a special note that should be mentioned about how the test suite 979 | is handled as it is important. 980 | 981 | It is difficult to make a good API that is easy to use, consistent, and 982 | difficult to *misuse*. Bad APIs abound in common and critical software 983 | (names will not be named) and can make an already difficult to use language 984 | like C even more difficult to use. 985 | 986 | One mistake that is often seen is API functionality that is conditional 987 | upon a macro. This complicates the build system along with every piece of 988 | software that is dependent on those optional calls. The most common functions 989 | to be optionally compiled in are test suite related functions if they are 990 | present. For good reason these test suites might need to be removed from builds 991 | (as they might take up large amounts of space for code even if they are not 992 | needed, which is at a premium in embedded systems with limited flash memory). 
993 | 994 | The header often contains code like this: 995 | 996 | #ifdef LIBRARY_UNIT_TESTS 997 | int library_unit_tests(void); 998 | #endif 999 | 1000 | And the code like this, in C like pseudo-code: 1001 | 1002 | #ifdef LIBRARY_UNIT_TESTS 1003 | int test_function_1(void) { 1004 | /* might call malloc directly, making this unsuitable 1005 | to be included in an embedded system */ 1006 | return result; 1007 | } 1008 | 1009 | int library_unit_tests(void) { 1010 | /* tests go here */ 1011 | if (test_function_1() != OK) 1012 | return FAIL; 1013 | return PASS; 1014 | } 1015 | #endif 1016 | 1017 | 1018 | In order to call this code you need to be aware of the "LIBRARY\_UNIT\_TESTS" 1019 | macro each time the function "library\_unit\_tests" is called, and worse, 1020 | whether or not your library was compiled with that macro enabled resulting 1021 | in link-time errors. Another common mistake is not passing in the functions 1022 | for I/O and allocation to the unit test framework, making it unsuitable for 1023 | embedded use (but that is a common criticism for many C libraries and not 1024 | just unit tests). 1025 | 1026 | Compare this to this libraries way of handling unit tests: 1027 | 1028 | In the header: 1029 | 1030 | int cdb_tests(const cdb_options_t *ops, const char *test_file); 1031 | 1032 | And the *relevant* bits of code/pseudo-code: 1033 | 1034 | static uint64_t xorshift128(uint64_t s[2]) { 1035 | assert(s); 1036 | /* XORSHIFT-128 algorithm */ 1037 | return NEXT_PRNG; 1038 | } 1039 | 1040 | 1041 | int cdb_tests(const cdb_options_t *ops, const char *test_file) { 1042 | assert(ops); 1043 | assert(test_file); 1044 | BUILD_BUG_ON(sizeof (cdb_word_t) < 2); 1045 | 1046 | if (CDB_TESTS_ON == 0) 1047 | return CDB_OK_E; 1048 | 1049 | /* LOTS OF TEST CODE NOT SHOWN, some of which 1050 | uses "xorshift128". 
*/ 1051 | 1052 | return STATUS; 1053 | } 1054 | 1055 | There is no "ifdef" surrounding any of the code (using "ifdef" anywhere to 1056 | conditionally execute code is usually a mistake, it is only used within the 1057 | project to set default macro values if the macro is not previously 1058 | defined, an acceptable usage). 1059 | 1060 | Two things are important here, the first, all of the Input and Output 1061 | and memory related functions are passed in via the "ops" structure, 1062 | as mentioned. This means that the test code is easy to port and run on 1063 | a microcontroller which might not have a file system (for testing and 1064 | development purposes you might want to run the tests on a microcontroller 1065 | but not keep them in the final product). 1066 | 1067 | The main difference is the lack of "ifdef" guards, instead if the macro 1068 | "CDB\_TESTS\_ON" is false the function "cdb\_tests" returns "CDB\_OK\_E" 1069 | (there is some debate if the return code should be this, or something 1070 | to indicate the tests are not present, but that is a separate issue, the 1071 | important bit is the return depending on whether the tests are present). 1072 | 1073 | This "if" statement is a *far superior* way of handling optional code in 1074 | general. The caller does not have to worry if the function is present or 1075 | not, as the function will always be present in the library. Not only that, 1076 | but if the tests are not run because the compile time macro "CDB\_TESTS\_ON" 1077 | is false then the compiler will optimize out those tests even on the lowest 1078 | optimization settings (on any decent compiler). 1079 | 1080 | This also has the advantage that the code that is not run still goes 1081 | through the compilation step meaning the code is less likely to be wrong 1082 | when refactoring code. 
Not only that, but because "xorshift128" which 1083 | "cdb\_tests" depends on, is declared to be static, if "CDB\_TESTS\_ON" is 1084 | false it too will be eliminated from the compiled object file so long as no 1085 | other function calls it. In actual fact, the code has changed since 1086 | this has been written and "cdb\_prng" is exposed in the header as it is 1087 | useful in [main.c][], which is equivalent to "xorshift128". 1088 | 1089 | # BUILD REQUIREMENTS 1090 | 1091 | If you are building the program from the repository at 1092 | <https://github.com/howerj/cdb> you will need [GNU Make][] and a [C 1093 | Compiler][]. The library is written in pure [C99][] and should be fairly 1094 | simple to port to another platform. Other [Make][] implementations may 1095 | work, however they have not been tested. [git][] is also used as part of 1096 | the build system. 1097 | 1098 | First clone the repository and change directory to the newly cloned repository: 1099 | 1100 | git clone https://github.com/howerj/cdb cdb 1101 | cd cdb 1102 | 1103 | Type 'make' to build the *cdb* executable and library. 1104 | 1105 | Type 'make test' to build and run the *cdb* internal tests. The script called 1106 | 't', written in [sh][], does more testing, and tests that the user interface 1107 | is working correctly. 'make dist' is used to create a compressed tar file for 1108 | distribution. 'make install' can be used to install the binaries, however the 1109 | default installation directory (which can be set with the 'DESTDIR' makefile 1110 | variable) installs to a directory called 'install' within the repository - 1111 | it will not actually install anything. Changing 'DESTDIR' to '/usr' should 1112 | install everything properly. [pandoc][] is required to build the manual page 1113 | for installation, which is generated from this [markdown][] file. 
1114 | 1115 | Look at the source file [cdb.c][] to see what compile time options can be 1116 | passed to the compiler to enable and disable features (if code size is a 1117 | concern then the ability to create databases can be removed, for example). 1118 | 1119 | # RENAME 1120 | 1121 | CDB databases are meant to be read-only, in order to add entries to 1122 | a database that database should be dumped and new values added in along 1123 | with the old ones. That is, to add in a new value to the database the 1124 | entire database has to be rebuilt. This is not a problem for *some* work 1125 | loads, for *some* work loads the database could be rebuilt every X hours. 1126 | 1127 | If this does present a problem, then you should not use this database. 1128 | 1129 | However, when a database does have to be rebuilt how do you make sure 1130 | that users of it point to the new database and not the old one? 1131 | 1132 | If you access the database via the command line applications then 1133 | the "[rename][]" function, which is atomic on POSIX systems, will do 1134 | what is needed. That is, a mechanism to swap out the old database with 1135 | a new one without affecting any of the current readers. 1136 | 1137 | A rename can be done in C like so: 1138 | 1139 | rename("new.cdb", "current.cdb"); /* Atomic rename */ 1140 | 1141 | If a reader opens "current.cdb" before the rename then it will continue 1142 | to read the old database until it closes the handle and opens up "current.cdb" 1143 | after the rename. The file's data persists even if there is no file name that 1144 | points to it so long as there are active users of that file (ie. If a file 1145 | handle to that file is still open). This will mean that there could be 1146 | processes that use old data, but not inconsistent data. If a reader opens 1147 | up the data after the rename, it will get the new data. 
1148 | 1149 | This also means that the writer should never write to a file that is 1150 | currently in use by other readers or writers, it should write to a new 1151 | file that will be renamed to the file in use, and it also means that a 1152 | large amount of disk storage space will be in use until all users of 1153 | the old databases switch to the new databases allowing the disk space 1154 | to be reclaimed by the operating system. 1155 | 1156 | # POSSIBLE DIRECTIONS 1157 | 1158 | There are many additions that could be made to a project, however the 1159 | code is quite compact and neat, anything else that is needed could be built 1160 | on top of this library. Some ideas for improvement include; adding a header 1161 | along with a [CRC][], adding (unsafe) functions for rewriting key-values, 1162 | adding (de)compression (with the [shrink][] library) and decryption, 1163 | integrating the project in an embedded system in conjunction with [littlefs][] 1164 | as an example, allowing the user to supply their own comparison and hash 1165 | functions, adding types and schemas to the database, and more. The project 1166 | could also be used as the primary database library for the [pickle][] 1167 | interpreter, or for serving static content in the [eweb][] web-server. 1168 | 1169 | All of these would add complexity, and more code - making it more useful 1170 | to some and less to others. As such, apart from bugs, the library and test 1171 | driver programs should be considered complete. 1172 | 1173 | The lack of a header might be solved in creative ways as: 1174 | 1175 | * The integrity of most of the file can be checked by making sure all pointers are 1176 | within bounds, that key-value pairs are stored one after another and that 1177 | each key is in the right bucket for that hash. The only things not checked 1178 | would be the values (they would still have to be of the right length). 
1179 | * If a file successfully passes a verification it can be identified as a valid 1180 | CDB file of that size, this means we would not need to store header 1181 | information about the file type and structure. This has been verified 1182 | experimentally (the empty and randomly generated databases of a different 1183 | size do not pass verification when the incorrect size is specified with 1184 | the "-b" option). 1185 | * We could place the header within the key-value section of the database, or 1186 | even at the end of the file. 1187 | 1188 | Things that *should* and *could* be done, but have not: 1189 | 1190 | * Fuzzing with [American Fuzzy Lop][] to iron out the most egregious 1191 | bugs, security relevant or otherwise. This has been used on the [pickle][] 1192 | library to great effect and it finds bugs that would not be caught by unit 1193 | testing alone. **The library is currently undergoing fuzzing, nothing 1194 | bad found so far**. 1195 | * The current library implements a system for looking up data 1196 | stored to disk, a *system* could be created that does so much more. 1197 | Amongst the things that could be done are: 1198 | - Using the CDB file format only as a serialization format 1199 | for an in memory database which would allow key deletion/replacing. 1200 | This Key-Value store would essentially just be an in memory hash 1201 | table with a fancy name, backed by this library. The project could 1202 | be done as part of this library or as a separate project. 1203 | - Implementing the [memcached protocol][] to allow remote querying 1204 | of data. 1205 | - Alternatively make a custom protocol that accepts commands over 1206 | UDP. 1207 | There are a few implementation strategies for doing this. 1208 | * Alternatively, just a simple Key-Value store that uses this database 1209 | as a back-end without anything else fancy. 1210 | * Changing the library interface so it is a [header only][] C library. 
1211 | * Making a set of callbacks to allow an in memory CDB database, useful 1212 | for embedding the database within binaries. 1213 | * Designing a suite of benchmarks for similar databases and implementations 1214 | of CDB, much like . 1215 | 1216 | Porting this to Rust and making a crate for it would be nice, 1217 | [although implementations already exists](https://crates.io/search?q=cdb). 1218 | Just making bindings for this library would be a good initial step, along 1219 | with other languages. 1220 | 1221 | For more things that are possible to do: 1222 | 1223 | * The API supplies a for-each loop mechanism where the user supplies a 1224 | callback, an iterator based solution would be more flexible (but slightly 1225 | more error prone to use). 1226 | * The user can specify their own hash algorithm, using one with perhaps 1227 | better characteristics for their purposes (and breaking compatibility 1228 | with the original format). One interesting possibility is using a hashing 1229 | algorithm that maximizes collisions of similar keys, so similar keys are 1230 | grouped together which may be useful when iterating over the database. 1231 | Unfortunately the initial 256 wide bucket system interferes with this, 1232 | which could be remedied by returning zero for lowest eight bits, degrading 1233 | performance. It is not really viable to do this with this system, but 1234 | hashing algorithms that maximize collisions, such as [SOUNDEX][], are 1235 | interesting and deserve a mention. This could be paired with a user 1236 | supplied comparison function for comparing the keys themselves. 1237 | * The callbacks for the file access words ("open", "read", ...) deserve 1238 | their own structure so it can be reused, as the allocator can, although 1239 | it may require some changes to how those functions work (such as different 1240 | return values, passing in a handle to arbitrary user supplied data, and 1241 | more). 
1242 | * Options for making the file checking more lax, as information could 1243 | be stored between the different key/value pairs making the file format 1244 | semi-compatible between implementations. This could be information usually 1245 | stored in the header, or information about the key/values themselves (such 1246 | as type information). Some implementations, including this one, are 1247 | more strict in what they accept. 1248 | * Some of the functions in [main.c][] could be moved into [cdb.c][] so 1249 | users do not have to reimplement them. 1250 | * A poor performance [Bloom Filter][] like algorithm can be made 1251 | using the first level hash table. A function to return whether an 1252 | item may be in the set or is definitely not can be made by checking 1253 | whether there are any items in the first 256 bucket that key hashes 1254 | to. The 256 bucket is small enough to fit in memory, as are the second 1255 | level hash tables which could be used to improve performance even more. 1256 | * If the user presorts the keys when adding the data then the keys can 1257 | be retrieved in order using the "foreach" API call. The user could sort 1258 | on the data instead if they like. 1259 | * The way version information is communicated within the API is not 1260 | perhaps the best way of doing it. A simple macro would suffice. 1261 | * The file format really could use a redesign. One improvement apart 1262 | from adding a header would be to move the 256 bucket initial hash table 1263 | to the end of the file so the entire file format could be streamed to 1264 | disk. 1265 | 1266 | # BUGS 1267 | 1268 | For any bugs, email the [author][]. It comes with a 'works on my machine 1269 | guarantee'. The code has been written with the intention of being portable, 1270 | and should work on 32-bit and 64-bit machines. It is tested more frequently 1271 | on a 64-bit Linux machine, and less frequently on Windows. 
Please give a 1272 | detailed bug report (including but not limited to what machine/OS you are 1273 | running on, compiler, compiler version, a failing example test case, your 1274 | blood type and star sign, etcetera). 1275 | 1276 | # PYTHON IMPLEMENTATION 1277 | 1278 | Available from here 1279 | . It 1280 | probably is the most succinct description and understandable by someone 1281 | not versed in python. 1282 | 1283 | #!/usr/bin/env python 1284 | 1285 | # Python implementation of cdb 1286 | 1287 | # calc hash value with a given key 1288 | def calc_hash(s): 1289 | return reduce(lambda h,c: (((h << 5) + h) ^ ord(c)) & 0xffffffffL, s, 5381) 1290 | 1291 | # cdbget(fp, basepos, key) 1292 | def cdbget(fp, pos_header, k): 1293 | from struct import unpack 1294 | 1295 | r = [] 1296 | h = calc_hash(k) 1297 | 1298 | fp.seek(pos_header + (h % 256)*(4+4)) 1299 | (pos_bucket, ncells) = unpack('> 8) % ncells 1303 | for i in range(ncells): 1304 | fp.seek(pos_bucket + ((start+i) % ncells)*(4+4)) 1305 | (h1, p1) = unpack('> 8) % ncells 1352 | while cell[i][1]: # is call[i] already occupied? 
1353 | i = (i+1) % ncells 1354 | cell[i] = (h,p) 1355 | for (h,p) in cell: 1356 | fp.write(pack(' /dev/null; 16 | time -p cdb -s "${PERFORMANCE}" > /dev/null; 17 | time -p cdbstats < "${PERFORMANCE}" > /dev/null; 18 | 19 | time -p ./cdb -d "${PERFORMANCE}" > /dev/null; 20 | time -p cdb -d "${PERFORMANCE}" > /dev/null; 21 | time -p cdbdump < "${PERFORMANCE}" > /dev/null; 22 | 23 | ./cdb -d "${PERFORMANCE}" > 1.txt; 24 | cdb -d "${PERFORMANCE}" > 2.txt; 25 | cdbdump < "${PERFORMANCE}" > 3.txt; 26 | 27 | time -p ./cdb -c 1.cdb < 1.txt; 28 | time -p cdb -c 2.cdb < 2.txt; 29 | time -p cdbmake 3.cdb temp.cdb < 3.txt; 30 | } 31 | 32 | usage () { 33 | HELP=$(cat < bist.txt; 67 | ./${CDB} -b ${SIZE} -c copy.cdb -T temp.cdb < bist.txt; 68 | ./${CDB} -b ${SIZE} -d copy.cdb | sort > copy.txt; 69 | diff -w bist.txt copy.txt; 70 | 71 | ./${CDB} -b ${SIZE} -c ${TESTDB} <X 73 | +1,0:X-> 74 | +1,1:a->b 75 | +1,1:a->b 76 | +1,1:a->c 77 | +1,5:b->hello 78 | +1,5:c->world 79 | +4,7:open->seasame 80 | EOF 81 | set +x; 82 | 83 | t() { 84 | R=$(eval "${1}"); 85 | if [ "${R}" != "${2}" ]; then 86 | echo "FAIL: '${1}' != '${2}'"; 87 | FAIL=1; 88 | else 89 | echo "ok: '${1}' = '${2}'"; 90 | fi; 91 | } 92 | 93 | f() { 94 | C=1 95 | R=$(eval "${1}") || C=$?; 96 | if [ "${R}" = "0" ]; then 97 | echo "FAIL: '${1} == ${2}' expected a failure"; 98 | FAIL=1; 99 | else 100 | echo "ok: '${1}' reports failure as expected: ${C}/${R}"; 101 | fi; 102 | } 103 | 104 | t "./${CDB} -b ${SIZE} -q ${TESTDB} a" b; 105 | t "./${CDB} -b ${SIZE} -q ${TESTDB} a 0" b; 106 | t "./${CDB} -b ${SIZE} -q ${TESTDB} a 1" b; 107 | t "./${CDB} -b ${SIZE} -q ${TESTDB} a 2" c; 108 | f "./${CDB} -b ${SIZE} -q ${TESTDB} a 3"; 109 | t "./${CDB} -b ${SIZE} -q ${TESTDB} X" ""; 110 | f "./${CDB} -b ${SIZE} -q ${TESTDB} XXX"; 111 | t "./${CDB} -b ${SIZE} -q ${TESTDB} \"\"" X; 112 | t "./${CDB} -b ${SIZE} -q ${TESTDB} b" hello; 113 | t "./${CDB} -b ${SIZE} -q ${TESTDB} c" world; 114 | t "./${CDB} -b ${SIZE} -q ${TESTDB} open" 
seasame; 115 | 116 | for i in $(seq 0 9); do 117 | for j in $(seq 0 9); do 118 | for k in $(seq 0 9); do 119 | KEY="${i}${j}${k}" 120 | VAL="${i}${j}${k}" 121 | echo "+${#KEY},${#VAL}:${KEY}->${VAL}"; 122 | done; 123 | done; 124 | done > seq.txt; 125 | echo > seq.txt 126 | 127 | dd if=/dev/zero of=invalid-1.cdb count=1 # Too small 128 | dd if=/dev/zero of=invalid-2.cdb count=4 # Invalid hash table pointers 129 | #dd if=${RANDOMSRC} of=invalid-3.cdb count=512 130 | 131 | f "./${CDB} -b ${SIZE} -s invalid-1.cdb" 132 | f "./${CDB} -b ${SIZE} -s invalid-2.cdb" 133 | #f "./${CDB} -s invalid-3.cdb" 134 | f "./${CDB} -b ${SIZE} -s /dev/null" 135 | 136 | set -x 137 | 138 | ./${CDB} -b ${SIZE} -c seq.cdb < seq.txt; 139 | ./${CDB} -b ${SIZE} -d seq.cdb | sort > qes.txt; 140 | 141 | diff -w seq.txt qes.txt; 142 | 143 | ./${CDB} -b ${SIZE} -s ${EMPTYDB} 144 | ./${CDB} -b ${SIZE} -s seq.cdb; 145 | ./${CDB} -b ${SIZE} -s ${TESTDB} 146 | ./${CDB} -b ${SIZE} -s bist.cdb; 147 | 148 | dd if=/dev/zero of=offset.bin count=5 bs=512 149 | cat offset.bin test.cdb > offset.cdb 150 | ./${CDB} -o 2560 -b ${SIZE} -V offset.cdb; 151 | 152 | set +x; 153 | done; 154 | 155 | make clean 156 | exit ${FAIL}; 157 | --------------------------------------------------------------------------------