├── .gitignore
├── LICENSE
├── cdb.c
├── cdb.h
├── host.c
├── host.h
├── main.c
├── makefile
├── readme.md
└── t


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.1
 2 | *.a
 3 | *.bin
 4 | *.cdb
 5 | *.db
 6 | *.dll
 7 | *.exe
 8 | *.o
 9 | *.so
10 | *.tgz
11 | *.tmp
12 | *.txt
13 | cdb
14 | install/*
15 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | This is free and unencumbered software released into the public domain.
 2 | 
 3 | Anyone is free to copy, modify, publish, use, compile, sell, or
 4 | distribute this software, either in source code form or as a compiled
 5 | binary, for any purpose, commercial or non-commercial, and by any
 6 | means.
 7 | 
 8 | In jurisdictions that recognize copyright laws, the author or authors
 9 | of this software dedicate any and all copyright interest in the
10 | software to the public domain. We make this dedication for the benefit
11 | of the public at large and to the detriment of our heirs and
12 | successors. We intend this dedication to be an overt act of
13 | relinquishment in perpetuity of all present and future rights to this
14 | software under copyright law.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 | 
24 | For more information, please refer to <http://unlicense.org>
25 | 


--------------------------------------------------------------------------------
/cdb.c:
--------------------------------------------------------------------------------
  1 | /* Program: Constant Database Library
  2 |  * Author:  Richard James Howe
  3 |  * Email:   howe.r.j.89@gmail.com
  4 |  * License: Unlicense
  5 |  * Repo:    <https://github.com/howerj/cdb>
  6 |  *
  7 |  * Consult the "readme.md" file for a detailed description
  8 |  * of the file format and internals. */
  9 | 
 10 | #include "cdb.h"
 11 | #include <assert.h>
 12 | #include <stdint.h>
 13 | #include <string.h>
 14 | #include <limits.h>
 15 | 
/* Library version number; 'cdb_version' places it in the low bits of the
 * value it reports and returns an error while it is still zero. */
#ifndef CDB_VERSION
#define CDB_VERSION (0x000000ul) /* all zeros = built incorrectly (set in makefile) */
#endif

/* Build option, reported by 'cdb_version' as bit 4 of the options field. */
#ifndef CDB_TESTS_ON
#define CDB_TESTS_ON (1)
#endif

/* Build option (options bit 5): when zero, 'cdb_open' in create mode,
 * 'cdb_add' and 'cdb_finalize' fail instead of writing anything. */
#ifndef CDB_WRITE_ON
#define CDB_WRITE_ON (1)
#endif

#ifndef CDB_MEMORY_INDEX_ON /* use in memory hash table if '1' for first table */
#define CDB_MEMORY_INDEX_ON (0)
#endif

/* Size in bytes of the stack buffer that 'cdb_compare' reads key data into. */
#ifndef CDB_READ_BUFFER_LENGTH
#define CDB_READ_BUFFER_LENGTH      (256ul)
#endif

/* Assertion hook: define 'cdb_assert' before this point to replace 'assert'. */
#ifndef cdb_assert
#define cdb_assert(X) (assert((X)))
#endif

/* Assert the logical implication "P implies Q". */
#define cdb_implies(P, Q)           cdb_assert(!(P) || (Q))

/* Compile time check: the array size is negative when 'condition' is true. */
#define CDB_BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
/* NB. CDB_MIN evaluates one of its arguments twice. */
#define CDB_MIN(X, Y)               ((X) < (Y) ? (X) : (Y))
/* Number of hash bits used to select a first level bucket. */
#define CDB_NBUCKETS                (8ul)
/* Number of first level buckets (two to the power of CDB_NBUCKETS). */
#define CDB_BUCKETS                 (1ul << CDB_NBUCKETS)
/* Value 'cdb_open' stores in the 'file_start' field of the handle. */
#define CDB_FILE_START              (0ul)
 47 | 
 48 | /* This enumeration is here and not in the header deliberately, it is to
 49 |  * stop error codes becoming part of the API for this library. */
/* Internal status codes. Negative values are stored in the 'error' field of
 * the handle by 'cdb_error'; note that CDB_OK_E and CDB_NOT_FOUND_E
 * deliberately share the value zero. */
enum {
	CDB_OK_E             =   0, /* no error */
	CDB_NOT_FOUND_E      =   0, /* key: not-found */
	CDB_FOUND_E          =   1, /* key: found */
	CDB_ERROR_E          =  -1, /* generic error */
	CDB_ERROR_HASH_E     =  -2, /* unexpected hash value given bucket */
	CDB_ERROR_BOUND_E    =  -3, /* pointers out of bounds */
	CDB_ERROR_OVERFLOW_E =  -4, /* some calculation overflowed and should not have */
	CDB_ERROR_OPEN_E     =  -5, /* open failed */
	CDB_ERROR_SEEK_E     =  -6, /* seek failed */
	CDB_ERROR_WRITE_E    =  -7, /* write failed to write any/enough bytes */
	CDB_ERROR_READ_E     =  -8, /* read failed to read any/enough bytes */
	CDB_ERROR_ALLOCATE_E =  -9, /* reallocate/allocation failed */
	CDB_ERROR_FREE_E     = -10, /* free failed */
	CDB_ERROR_MODE_E     = -11, /* incorrect mode for operation */
	CDB_ERROR_DISABLED_E = -12, /* unimplemented/disabled feature */
	CDB_ERROR_SIZE_E     = -13, /* invalid/unsupported size */
};
 68 | 
/* One entry of the first level table: where a secondary hash table lives and
 * how large it is. While a database is being created 'cdb_hash_grow' uses
 * 'length' as the count of entries added to the bucket so far, and
 * 'cdb_finalize' writes out twice that many slots. */
typedef struct {
	cdb_word_t position; /* position on disk of this hash table, when known */
	cdb_word_t length;   /* number of buckets in hash table */
} cdb_hash_header_t; /* initial hash table structure */
 73 | 
 74 | /* NB. More is allocated than needed for the memory index, it
 75 |  * would make things ugly to correct this however, so it will not be. */
/* Per-bucket bookkeeping. 'hashes' and 'fps' are parallel arrays grown by
 * 'cdb_hash_grow' and released by 'cdb_hash_free'; entry 'j' of each array
 * describes the same record. */
typedef struct {
	cdb_word_t *hashes;       /* full key hashes */
	cdb_word_t *fps;          /* file pointers */
	cdb_hash_header_t header; /* header for this hash table */
} cdb_hash_table_t; /* secondary hash table structure */
 81 | 
struct cdb { /* constant database handle: for all your querying needs! */
	cdb_options_t ops;     /* custom file/flash operators */
	void	*file;         /* database handle */
	cdb_word_t file_start, /* start position of structures in file */
	       file_end,       /* end position of database in file, if known, zero otherwise */
	       hash_start;     /* start of secondary hash tables near end of file, if known, zero otherwise */
	cdb_word_t position;   /* read/write/seek position: be careful with this variable! */
	int error;             /* error, if any, any error causes database to be invalid; zero or negative, the first error recorded is kept */
	unsigned create : 1,   /* have we opened database up in create mode? */
		 opened : 1,   /* have we successfully opened up the database? */
		 empty  : 1,   /* is the database empty? */
		 sought : 1;   /* have we performed at least one seek (needed to position init cache) */
	/* Flexible array member: 'cdb_open' allocates CDB_BUCKETS elements when
	 * in create mode or when CDB_MEMORY_INDEX_ON is set, and none otherwise. */
	cdb_hash_table_t table1[];
};
 96 | 
 97 | /* To make the library easier to use we could provide a set of default
 98 |  * allocators (that when compiled out always return an error), the non-thread
 99 |  * safe allocator would return a pointer to a statically declared variable and
100 |  * mark it as being used. */
101 | 
102 | int cdb_version(unsigned long *version) {
103 | 	CDB_BUILD_BUG_ON(sizeof(cdb_word_t) != 2 && sizeof(cdb_word_t) != 4 && sizeof(cdb_word_t) != 8);
104 | 	cdb_assert(version);
105 | 	unsigned long spec = ((sizeof (cdb_word_t)) * CHAR_BIT) >> 4; /* Lowest three bits = size */
106 | 	spec |= CDB_TESTS_ON        << 4;
107 | 	spec |= CDB_WRITE_ON        << 5;
108 | 	spec |= CDB_MEMORY_INDEX_ON << 6;
109 | 	/*spec |= 0                 << 7; */
110 | 	*version = (spec << 24) | CDB_VERSION;
111 | 	return CDB_VERSION == 0 ? CDB_ERROR_E : CDB_OK_E;
112 | }
113 | 
114 | int cdb_status(cdb_t *cdb) {
115 | 	cdb_assert(cdb);
116 | 	return cdb->error;
117 | }
118 | 
119 | static inline size_t cdb_get_size(cdb_t *cdb) {
120 | 	cdb_assert(cdb);
121 | 	return cdb->ops.size;
122 | }
123 | 
124 | static inline uint64_t cdb_get_mask(cdb_t *cdb) {
125 | 	cdb_assert(cdb);
126 | 	const size_t l = cdb_get_size(cdb);
127 | 	if (l == 16/CHAR_BIT)
128 | 		return UINT16_MAX;
129 | 	if (l == 32/CHAR_BIT)
130 | 		return UINT32_MAX;
131 | 	cdb_assert(l == 64/CHAR_BIT);
132 | 	return UINT64_MAX;
133 | }
134 | 
135 | /* This is not 'djb2' hash - the character is xor'ed in and not added. This
136 |  * has sometimes been called 'DJB2a'. */
/* Hash 'length' bytes starting at 's': begin with 5381 and, for each byte,
 * multiply the running hash by 33 and xor the byte in. An empty input
 * yields 5381. */
static inline uint32_t cdb_djb_hash(const uint8_t *s, const size_t length) {
	cdb_assert(s);
	uint32_t hash = 5381ul;
	const uint8_t * const end = s + length;
	for (const uint8_t *p = s; p != end; p++)
		hash = ((hash << 5ul) + hash) ^ *p; /* (hash * 33) xor byte */
	return hash;
}
144 | 
/* Default key comparison, installed by 'cdb_open' when the caller supplies
 * none: a plain 'memcmp' of 'length' bytes, zero meaning "equal". */
static int cdb_memory_compare(const void *a, const void *b, size_t length) {
	cdb_assert(a);
	cdb_assert(b);
	const int order = memcmp(a, b, length);
	return order;
}
150 | 
/* Signature shared by the built-in key hash functions; 'cdb_open' selects
 * one of them when the caller does not supply a hash of their own. */
typedef cdb_word_t (*cdb_hash_fn)(const uint8_t *s, const size_t length);
152 | 
153 | cdb_word_t cdb_hash(const uint8_t *s, const size_t length) {
154 | 	cdb_assert(s);
155 | 	return cdb_djb_hash(s, length);
156 | }
157 | 
158 | /* A 64-bit hash has to be used for the 64-bit database version otherwise if 
159 |  * we used a 32-bit hash all of our keys and values would be
160 |  * stored...suboptimally. */
161 | static cdb_word_t cdb_hash64(const uint8_t *s, const size_t length) {
162 | 	cdb_assert(s);
163 | 	/* SDBM hash see: <http://www.cse.yorku.ca/~oz/hash.html>
164 | 	and <https://www.partow.net/programming/hashfunctions> */
165 | 	assert(sizeof(cdb_word_t) >= sizeof(uint64_t));
166 | 	uint64_t hash = 0xA5A5A5A5A5A5A5A5ull;
167 | 	for (size_t i = 0; i < length; i++) {
168 | 		const uint64_t ch = s[i];
169 | 		hash = ch + (hash << 6) + (hash << 16) - hash;
170 | 	}
171 | 	return hash;
172 | }
173 | 
174 | static void cdb_preconditions(cdb_t *cdb) {
175 | 	cdb_assert(cdb);
176 | 	cdb_implies(cdb->file_end   != 0, cdb->file_end   > cdb->file_start);
177 | 	cdb_implies(cdb->hash_start != 0, cdb->hash_start > cdb->file_start);
178 | 	cdb_assert(cdb->ops.allocator);
179 | 	cdb_assert(cdb->ops.read);
180 | 	cdb_assert(cdb->ops.open);
181 | 	cdb_assert(cdb->ops.close);
182 | 	cdb_assert(cdb->ops.seek);
183 | 	cdb_assert(cdb->error <= 0);
184 | 	cdb_implies(cdb->create, cdb->ops.write);
185 | 	/*cdb_assert(cdb->error == 0);*/
186 | }
187 | 
188 | static inline int cdb_failure(cdb_t *cdb) {
189 | 	cdb_preconditions(cdb);
190 | 	return cdb->error ? CDB_ERROR_E : CDB_OK_E;
191 | }
192 | 
193 | static inline int cdb_error(cdb_t *cdb, const int error) {
194 | 	cdb_preconditions(cdb);
195 | 	if (cdb->error == 0)
196 | 		cdb->error = error;
197 | 	return cdb_failure(cdb);
198 | }
199 | 
200 | static inline int cdb_bound_check(cdb_t *cdb, const int fail) {
201 | 	cdb_assert(cdb);
202 | 	return cdb_error(cdb, fail ? CDB_ERROR_BOUND_E : CDB_OK_E);
203 | }
204 | 
205 | static inline int cdb_hash_check(cdb_t *cdb, const int fail) {
206 | 	cdb_assert(cdb);
207 | 	return cdb_error(cdb, fail ? CDB_ERROR_HASH_E : CDB_OK_E);
208 | }
209 | 
210 | static inline int cdb_overflow_check(cdb_t *cdb, const int fail) {
211 | 	cdb_assert(cdb);
212 | 	return cdb_error(cdb, fail ? CDB_ERROR_OVERFLOW_E : CDB_OK_E);
213 | }
214 | 
215 | static inline int cdb_free(cdb_t *cdb, void *p) {
216 | 	cdb_assert(cdb);
217 | 	if (!p)
218 | 		return 0;
219 | 	(void)cdb->ops.allocator(cdb->ops.arena, p, 0, 0);
220 | 	return 0;
221 | }
222 | 
223 | static inline void *cdb_allocate(cdb_t *cdb, const size_t length) {
224 | 	cdb_preconditions(cdb);
225 | 	void *r = cdb->ops.allocator(cdb->ops.arena, NULL, 0, length);
226 | 	if (length != 0 && r == NULL)
227 | 		(void)cdb_error(cdb, CDB_ERROR_ALLOCATE_E);
228 | 	return r ? memset(r, 0, length) : NULL;
229 | }
230 | 
231 | static inline void *cdb_reallocate(cdb_t *cdb, void *pointer, const size_t length) {
232 | 	cdb_preconditions(cdb);
233 | 	void *r = cdb->ops.allocator(cdb->ops.arena, pointer, 0, length);
234 | 	if (length != 0 && r == NULL)
235 | 		(void)cdb_error(cdb, CDB_ERROR_ALLOCATE_E);
236 | 	return r;
237 | }
238 | 
239 | /* NB. A seek can cause buffers to be flushed, which degrades performance quite a lot */
240 | static int cdb_seek_internal(cdb_t *cdb, const cdb_word_t position) {
241 | 	cdb_preconditions(cdb);
242 | 	if (cdb->error)
243 | 		return -1;
244 | 	if (cdb->opened && cdb->create == 0)
245 | 		if (cdb_bound_check(cdb, position < cdb->file_start || cdb->file_end < position))
246 | 			return -1;
247 | 	if (cdb->sought == 1u && cdb->position == position)
248 | 		return cdb_error(cdb, CDB_OK_E);
249 | 	const int r = cdb->ops.seek(cdb->file, position + cdb->ops.offset);
250 | 	if (r >= 0) {
251 | 		cdb->position = position;
252 | 		cdb->sought = 1u;
253 | 	}
254 | 	return cdb_error(cdb, r < 0 ? CDB_ERROR_SEEK_E : CDB_OK_E);
255 | }
256 | 
257 | int cdb_seek(cdb_t *cdb, const cdb_word_t position) {
258 | 	cdb_preconditions(cdb);
259 | 	if (cdb_error(cdb, cdb->create != 0 ? CDB_ERROR_MODE_E : 0))
260 | 		return 0;
261 | 	return cdb_seek_internal(cdb, position);
262 | }
263 | 
264 | static cdb_word_t cdb_read_internal(cdb_t *cdb, void *buf, cdb_word_t length) {
265 | 	cdb_preconditions(cdb);
266 | 	cdb_assert(buf);
267 | 	if (cdb_error(cdb, cdb->create != 0 ? CDB_ERROR_MODE_E : 0))
268 | 		return 0;
269 | 	const cdb_word_t r = cdb->ops.read(cdb->file, buf, length);
270 | 	const cdb_word_t n = cdb->position + r;
271 | 	if (cdb_overflow_check(cdb, n < cdb->position) < 0)
272 | 		return 0;
273 | 	cdb->position = n;
274 | 	return r;
275 | }
276 | 
277 | int cdb_read(cdb_t *cdb, void *buf, cdb_word_t length) {
278 | 	cdb_preconditions(cdb);
279 | 	const cdb_word_t r = cdb_read_internal(cdb, buf, length);
280 | 	return cdb_error(cdb, r != length ? CDB_ERROR_READ_E : 0);
281 | }
282 | 
283 | static cdb_word_t cdb_write(cdb_t *cdb, void *buf, size_t length) {
284 | 	cdb_preconditions(cdb);
285 | 	cdb_assert(buf);
286 | 	if (cdb_error(cdb, cdb->create == 0 ? CDB_ERROR_MODE_E : 0))
287 | 		return 0;
288 | 	const cdb_word_t r = cdb->ops.write(cdb->file, buf, length);
289 | 	const cdb_word_t n = cdb->position + r;
290 | 	if (cdb_overflow_check(cdb, n < cdb->position) < 0)
291 | 		return 0;
292 | 	if (r != length)
293 | 		return cdb_error(cdb, CDB_ERROR_WRITE_E);
294 | 	cdb->position = n;
295 | 	return r;
296 | }
297 | 
298 | static inline void cdb_pack(uint8_t b[/*static (sizeof (cdb_word_t))*/], cdb_word_t w, size_t l) {
299 | 	cdb_assert(b);
300 | 	for (size_t i = 0; i < l; i++)
301 | 		b[i] = (w >> (i * CHAR_BIT)) & 0xFFu;
302 | }
303 | 
304 | static inline cdb_word_t cdb_unpack(uint8_t b[/*static (sizeof (cdb_word_t))*/], size_t l) {
305 | 	cdb_assert(b);
306 | 	cdb_word_t w = 0;
307 | 	for (size_t i = 0; i < l; i++)
308 | 		w |= ((cdb_word_t)b[i]) << (i * CHAR_BIT);
309 | 	return w;
310 | }
311 | 
312 | int cdb_read_word_pair(cdb_t *cdb, cdb_word_t *w1, cdb_word_t *w2) {
313 | 	cdb_assert(cdb);
314 | 	cdb_assert(w1);
315 | 	cdb_assert(w2);
316 | 	const size_t l = cdb_get_size(cdb);
317 | 	/* we only need to set this to 'b' to a value to avoid static checkers
318 | 	 * signalling a problem, 'b' should be written to be
319 | 	 * 'cdb_read_internal' before it is used. */
320 | 	uint8_t b[2ul * sizeof(cdb_word_t)] = { 0, };
321 | 	const cdb_word_t r = cdb_read_internal(cdb, b, 2ul * l);
322 | 	if (r != (cdb_word_t)(2l * l))
323 | 		return -1;
324 | 	*w1 = cdb_unpack(b, l);
325 | 	*w2 = cdb_unpack(b + l, l);
326 | 	return 0;
327 | }
328 | 
329 | static int cdb_write_word_pair(cdb_t *cdb, const cdb_word_t w1, const cdb_word_t w2) {
330 | 	cdb_assert(cdb);
331 | 	const size_t l = cdb_get_size(cdb);
332 | 	uint8_t b[2ul * sizeof(cdb_word_t)]; /* NOT INITIALIZED */
333 | 	cdb_pack(b,     w1, l);
334 | 	cdb_pack(b + l, w2, l);
335 | 	if (cdb_write(cdb, b, 2ul * l) != (2ul * l))
336 | 		return -1;
337 | 	return 0;
338 | }
339 | 
340 | static int cdb_hash_free(cdb_t *cdb, cdb_hash_table_t *t) {
341 | 	cdb_assert(cdb);
342 | 	cdb_assert(t);
343 | 	const int r1 = cdb_free(cdb, t->hashes);
344 | 	const int r2 = cdb_free(cdb, t->fps);
345 | 	t->hashes = NULL;
346 | 	t->fps    = NULL;
347 | 	/* do not free t */
348 | 	return r1 < 0 || r2 < 0 ? -1 : 0;
349 | }
350 | 
351 | static int cdb_free_resources(cdb_t *cdb) {
352 | 	if (!cdb)
353 | 		return 0;
354 | 	if (cdb->file)
355 | 		cdb->ops.close(cdb->file);
356 | 	cdb->file = NULL;
357 | 	cdb->opened = 0;
358 | 	int r = 0;
359 | 	for (size_t i = 0; cdb->create && i < CDB_BUCKETS; i++)
360 | 		if (cdb_hash_free(cdb, &cdb->table1[i]) < 0)
361 | 			r = -1;
362 | 	(void)cdb_error(cdb, CDB_ERROR_E);
363 | 	(void)cdb->ops.allocator(cdb->ops.arena, cdb, 0, 0);
364 | 	return r;
365 | }
366 | 
/* Complete a database that is being created: write one secondary hash table
 * per bucket after the key/value records, then go back to the start of the
 * file and write the first level table that points at them.
 *
 * Requires a handle in create mode with no recorded error (asserted).
 * Returns a negative value on failure (the error is recorded in the
 * handle); on success returns the result of the optional flush callback, or
 * zero if there is none. */
static inline int cdb_finalize(cdb_t *cdb) { /* write hash tables to disk */
	cdb_assert(cdb);
	cdb_assert(cdb->error == 0);
	cdb_assert(cdb->create == 1);
	if (CDB_WRITE_ON == 0)
		return cdb_error(cdb, CDB_ERROR_DISABLED_E);
	int r = 0;
	cdb_word_t mlen = 8; /* current capacity, in elements, of the scratch arrays */
	cdb_word_t *hashes    = cdb_allocate(cdb, mlen * sizeof *hashes);
	cdb_word_t *positions = cdb_allocate(cdb, mlen * sizeof *positions);
	if (!hashes || !positions)
		goto fail;
	/* NB. No need to seek as we are the only thing that can affect
	 * cdb->position in write mode */
	cdb->hash_start = cdb->position;

	for (size_t i = 0; i < CDB_BUCKETS; i++) { /* write tables at end of file */
		cdb_hash_table_t *t = &cdb->table1[i];
		/* each table gets twice as many slots as it has entries */
		const cdb_word_t length = t->header.length * 2ul;
		t->header.position = cdb->position; /* needs to be set */
		if (length == 0)
			continue;
		if (cdb_bound_check(cdb, length < t->header.length) < 0)
			goto fail;
		if (mlen < length) { /* grow both scratch arrays to fit this table */
			const cdb_word_t required = length * sizeof (cdb_word_t);
			if (cdb_overflow_check(cdb, required < length) < 0)
				goto fail;
			cdb_word_t *t1 = cdb_reallocate(cdb, hashes, required);
			if (!t1)
				goto fail;
			hashes = t1;
			cdb_word_t *t2 = cdb_reallocate(cdb, positions, required);
			if (!t2)
				goto fail;
			positions = t2;
			mlen = length;
		}

		memset(hashes,    0, length * sizeof (cdb_word_t));
		memset(positions, 0, length * sizeof (cdb_word_t));

		/* Insert every entry with linear probing; a slot whose position is
		 * zero is treated as free. */
		for (size_t j = 0; j < t->header.length; j++) {
			const cdb_word_t h = t->hashes[j];
			const cdb_word_t p = t->fps[j];
			const cdb_word_t start = (h >> CDB_NBUCKETS) % length;
			cdb_word_t k = 0;
			for (k = start; positions[k]; k = (k + 1ul) % length)
				;
			hashes[k]    = h;
			positions[k] = p;
		}

		for (cdb_word_t j = 0; j < length; j++)
			if (cdb_write_word_pair(cdb, hashes[j], positions[j]) < 0)
				goto fail;
	}
	cdb->file_end = cdb->position;
	if (cdb_seek_internal(cdb, cdb->file_start) < 0)
		goto fail;
	for (size_t i = 0; i < CDB_BUCKETS; i++) { /* write initial hash table */
		const cdb_hash_table_t * const t = &cdb->table1[i];
		if (cdb_write_word_pair(cdb, t->header.position, (t->header.length * 2ul)) < 0)
			goto fail;
	}
	if (cdb_free(cdb, hashes) < 0)
		r = -1;
	if (cdb_free(cdb, positions) < 0)
		r = -1;
	return r == 0 && cdb->ops.flush ? cdb->ops.flush(cdb->file) : r;
fail:
	(void)cdb_free(cdb, hashes);
	(void)cdb_free(cdb, positions);
	return cdb_error(cdb, CDB_ERROR_E);
}
442 | 
443 | int cdb_close(cdb_t *cdb) { /* free cdb, close (and write to disk if in create mode) */
444 | 	if (!cdb)
445 | 		return 0;
446 | 	if (cdb->error)
447 | 		goto fail;
448 | 	if (cdb->create)
449 | 		if (cdb_finalize(cdb) < 0)
450 | 			goto fail;
451 | 	return cdb_free_resources(cdb);
452 | fail:
453 | 	(void)cdb_free_resources(cdb);
454 | 	return CDB_ERROR_E;
455 | }
456 | int cdb_open(cdb_t **cdb, const cdb_options_t *ops, const int create, const char *file) {
457 | 	/* We could allow the word size of the CDB database {16, 32 (default) or 64}
458 | 	 * to be configured at run time and not compile time, this has API related
459 | 	 * consequences, the size of 'cdb_word_t' would determine the maximum size that
460 | 	 * could be supported by this library. 'cdb_open' would have to take another
461 | 	 * parameter or one of the structures passed in would need to be extended. */
462 | 	cdb_assert(cdb);
463 | 	cdb_assert(ops);
464 | 	cdb_assert(ops->allocator);
465 | 	cdb_assert(ops->read);
466 | 	cdb_assert(ops->open);
467 | 	cdb_assert(ops->close);
468 | 	cdb_assert(ops->seek);
469 | 	cdb_implies(create, ops->write);
470 | 	CDB_BUILD_BUG_ON(CHAR_BIT != 8);
471 | 	/* ops->flush is optional */
472 | 	*cdb = NULL;
473 | 	if (create && CDB_WRITE_ON == 0)
474 | 		return CDB_ERROR_E;
475 | 	if (ops->size != 0 && ops->size != 16 && ops->size != 32 && ops->size != 64)
476 | 		return CDB_ERROR_SIZE_E;
477 | 	if (ops->size != 0 && ops->size > (sizeof(cdb_word_t) * CHAR_BIT))
478 | 		return CDB_ERROR_SIZE_E;
479 | 	cdb_t *c = NULL;
480 | 	const int large = CDB_MEMORY_INDEX_ON || create;
481 | 	const size_t csz = (sizeof *c) + (large * sizeof c->table1[0] * CDB_BUCKETS);
482 | 	c = ops->allocator(ops->arena, NULL, 0, csz);
483 | 	if (!c)
484 | 		goto fail;
485 | 	memset(c, 0, csz);
486 | 	c->ops         = *ops;
487 | 	const cdb_hash_fn hash_fn = c->ops.size >= 64 ? cdb_hash64 : cdb_hash;
488 | 	c->ops.size    = c->ops.size    ? c->ops.size / CHAR_BIT : (32ul / CHAR_BIT);
489 | 	c->ops.hash    = c->ops.hash    ? c->ops.hash    : hash_fn;
490 | 	c->ops.compare = c->ops.compare ? c->ops.compare : cdb_memory_compare;
491 | 	c->create      = create;
492 | 	c->empty       = 1;
493 | 	*cdb           = c;
494 | 	c->file_start  = CDB_FILE_START;
495 | 	c->file        = c->ops.open(file, create ? CDB_RW_MODE : CDB_RO_MODE);
496 | 	if (!(c->file)) {
497 | 		(void)cdb_error(c, CDB_ERROR_OPEN_E);
498 | 		goto fail;
499 | 	}
500 | 	if (cdb_seek_internal(c, c->file_start) < 0)
501 | 		goto fail;
502 | 	if (create) {
503 | 		for (size_t i = 0; i < CDB_BUCKETS; i++) /* write empty header */
504 | 			if (cdb_write_word_pair(c, 0, 0) < 0)
505 | 				goto fail;
506 | 	} else {
507 | 		/* We allocate more memory than we need if CDB_MEMORY_INDEX_ON is
508 | 		 * true as 'cdb_hash_table_t' contains entries needed for
509 | 		 * creation that we do not need when reading the database. */
510 | 		cdb_word_t hpos = 0, hlen = 0, lpos = -1l, lset = 0, prev = 0, pnum = 0;
511 | 		for (size_t i = 0; i < CDB_BUCKETS; i++) {
512 | 			cdb_hash_table_t t = { .header = { .position = 0, .length = 0 } };
513 | 			if (cdb_read_word_pair(c, &t.header.position, &t.header.length) < 0)
514 | 				goto fail;
515 | 			if (i && t.header.position != (prev + (pnum * (2ul * cdb_get_size(c)))))
516 | 				goto fail;
517 | 			prev = t.header.position;
518 | 			pnum = t.header.length;
519 | 			if (CDB_MEMORY_INDEX_ON)
520 | 				c->table1[i] = t;
521 | 			if (t.header.length)
522 | 				c->empty = 0;
523 | 			if (t.header.length && t.header.position < lpos) {
524 | 				lpos = t.header.position;
525 | 				lset = 1;
526 | 			}
527 | 			if (t.header.position > hpos) {
528 | 				hpos = t.header.position;
529 | 				hlen = t.header.length;
530 | 			}
531 | 		}
532 | 		if (cdb_seek_internal(c, c->file_start) < 0)
533 | 			goto fail;
534 | 		c->file_end   = hpos + (hlen * (2ul * cdb_get_size(c)));
535 | 		c->hash_start = lset ? lpos : (CDB_BUCKETS * (2ul * cdb_get_size(c)));
536 | 		if (lset) {
537 | 			if (cdb_bound_check(c, c->file_start > lpos) < 0)
538 | 				goto fail;
539 | 		}
540 | 		if (cdb_overflow_check(c, c->file_end < hpos) < 0)
541 | 			goto fail;
542 | 	}
543 | 	c->opened = 1;
544 | 	return CDB_OK_E;
545 | fail:
546 | 	(void)cdb_close(c);
547 | 	return CDB_ERROR_E;
548 | }
549 | 
550 | /* returns: -1 = error, 0 = not equal, 1 = equal */
551 | static int cdb_compare(cdb_t *cdb, const cdb_buffer_t *k1, const cdb_file_pos_t *k2) {
552 | 	cdb_assert(cdb);
553 | 	cdb_assert(cdb->ops.compare);
554 | 	cdb_assert(k1);
555 | 	cdb_assert(k2);
556 | 	if (k1->length != k2->length)
557 | 		return CDB_NOT_FOUND_E; /* not equal */
558 | 	const cdb_word_t length = k1->length;
559 | 	if (cdb_seek_internal(cdb, k2->position) < 0)
560 | 		return CDB_ERROR_E;
561 | 	for (cdb_word_t i = 0; i < length; i += CDB_READ_BUFFER_LENGTH) {
562 | 		/* Note that making this buffer larger may not make things faster - if
563 | 		 * most keys differ in the first few bytes then a smaller buffer means
564 | 		 * less bytes moved around before the comparison. */
565 | 		uint8_t kbuf[CDB_READ_BUFFER_LENGTH];
566 | 		CDB_BUILD_BUG_ON(sizeof kbuf != CDB_READ_BUFFER_LENGTH);
567 | 		const cdb_word_t rl = CDB_MIN((cdb_word_t)sizeof kbuf, (cdb_word_t)length - i);
568 | 		if (cdb_read_internal(cdb, kbuf, rl) != rl)
569 | 			return CDB_ERROR_E;
570 | 		if (cdb->ops.compare(k1->buffer + i, kbuf, rl))
571 | 			return CDB_NOT_FOUND_E;
572 | 	}
573 | 	return CDB_FOUND_E; /* equal */
574 | }
575 | 
/* Look up 'key' and locate one of its records.
 *
 * On entry '*record' is the zero based index of the wanted record among
 * those stored under 'key'. On return '*record' holds the index of the
 * record found, or the number of matching records seen if the wanted one
 * was not found (zero on error). '*value' receives the position and length
 * of the value when found and is zeroed otherwise.
 *
 * Returns CDB_FOUND_E, CDB_NOT_FOUND_E, or a negative value on error (the
 * error is recorded in the handle). Not valid in create mode. */
static int cdb_retrieve(cdb_t *cdb, const cdb_buffer_t *key, cdb_file_pos_t *value, uint64_t *record) {
	cdb_assert(cdb);
	cdb_assert(cdb->opened);
	cdb_assert(cdb->ops.hash);
	cdb_assert(key); /* If key was NULL, we *could* lookup the values instead using cdb_foreach */
	cdb_assert(value);
	cdb_assert(record);
	cdb_word_t pos = 0, num = 0, h = 0;
	uint64_t wanted = *record, recno = 0;
	*record = 0;
	*value = (cdb_file_pos_t) { 0, 0, };
	if (cdb->error)
		goto fail;
	if (cdb->create) {
		(void)cdb_error(cdb, CDB_ERROR_MODE_E);
		goto fail;
	}
	/* It is usually a good idea to include the length as part of the data
	 * of the hash, however that would make the format incompatible. */
	h = cdb->ops.hash((uint8_t *)(key->buffer), key->length) & cdb_get_mask(cdb); /* locate key in first table */
	if (CDB_MEMORY_INDEX_ON) { /* use more memory (~4KiB) to speed up first match */
		cdb_hash_table_t *t = &cdb->table1[h % CDB_BUCKETS];
		pos = t->header.position;
		num = t->header.length;
	} else {
		if (cdb_seek_internal(cdb, cdb->file_start + ((h % CDB_BUCKETS) * (2ul * cdb_get_size(cdb)))) < 0)
			goto fail;
		if (cdb_read_word_pair(cdb, &pos, &num) < 0)
			goto fail;
	}
	if (num == 0) /* no keys in this bucket -> key not found */
		return cdb_failure(cdb) < 0 ? CDB_ERROR_E : CDB_NOT_FOUND_E;
	if (cdb_bound_check(cdb, pos > cdb->file_end || pos < cdb->hash_start) < 0)
		goto fail;
	/* probe the secondary table linearly, wrapping around, starting at
	 * the slot selected by the remaining hash bits */
	const cdb_word_t start = (h >> CDB_NBUCKETS) % num;
	for (cdb_word_t i = 0; i < num; i++) {
		const cdb_word_t seekpos = pos + (((start + i) % num) * (2ul * cdb_get_size(cdb)));
		if (seekpos < pos || seekpos > cdb->file_end)
			goto fail;
		if (cdb_seek_internal(cdb, seekpos) < 0)
			goto fail;
		cdb_word_t h1 = 0, p1 = 0; /* slot contents: stored hash and record position */
		if (cdb_read_word_pair(cdb, &h1, &p1) < 0)
			goto fail;
		if (cdb_bound_check(cdb, p1 > cdb->hash_start) < 0) /* key-value pair should not overlap with hash tables section */
			goto fail;
		if (p1 == 0) { /* end of list */
			*record         = recno;
			return cdb_failure(cdb) < 0 ? CDB_ERROR_E : CDB_NOT_FOUND_E;
		}
		if (cdb_hash_check(cdb, (h1 & 0xFFul) != (h & 0xFFul)) < 0) /* buckets bits should be the same */
			goto fail;
		if (h1 == h) { /* possible match */
			if (cdb_seek_internal(cdb, p1) < 0)
				goto fail;
			cdb_word_t klen = 0, vlen = 0;
			if (cdb_read_word_pair(cdb, &klen, &vlen) < 0)
				goto fail;
			/* the key follows the length pair, the value follows the key */
			const cdb_file_pos_t k2 = { .length = klen, .position = p1 + (2ul * cdb_get_size(cdb)) };
			if (cdb_overflow_check(cdb, k2.position < p1 || (k2.position + klen) < k2.position) < 0)
				goto fail;
			if (cdb_bound_check(cdb, k2.position + klen > cdb->hash_start) < 0)
				goto fail;
			const int comp = cdb_compare(cdb, key, &k2);
			const int found = comp > 0;
			if (comp < 0)
				goto fail;
			if (found && recno == wanted) { /* found key, correct record? */
				cdb_file_pos_t v2 = { .length = vlen, .position = k2.position + klen };
				if (cdb_overflow_check(cdb, (v2.position + v2.length) < v2.position) < 0)
					goto fail;
				if (cdb_bound_check(cdb, v2.position > cdb->hash_start) < 0)
					goto fail;
				if (cdb_bound_check(cdb, (v2.position + v2.length) > cdb->hash_start) < 0)
					goto fail;
				*value          = v2;
				*record         = recno;
				return cdb_failure(cdb) < 0 ? CDB_ERROR_E : CDB_FOUND_E;
			}
			recno += found; /* count matching keys that were not the wanted record */
		}
	}
	*record         = recno;
	return cdb_failure(cdb) < 0 ? CDB_ERROR_E : CDB_NOT_FOUND_E;
fail:
	return cdb_error(cdb, CDB_ERROR_E);
}
663 | 
664 | int cdb_lookup(cdb_t *cdb, const cdb_buffer_t *key, cdb_file_pos_t *value, uint64_t record) {
665 | 	cdb_assert(cdb);
666 | 	cdb_assert(cdb->opened);
667 | 	cdb_assert(key);
668 | 	cdb_assert(value);
669 | 	return cdb_retrieve(cdb, key, value, &record);
670 | }
671 | 
672 | int cdb_get(cdb_t *cdb, const cdb_buffer_t *key, cdb_file_pos_t *value) {
673 | 	cdb_assert(cdb);
674 | 	cdb_assert(cdb->opened);
675 | 	cdb_assert(key);
676 | 	cdb_assert(value);
677 | 	return cdb_lookup(cdb, key, value, 0l);
678 | }
679 | 
680 | int cdb_count(cdb_t *cdb, const cdb_buffer_t *key, uint64_t *count) {
681 | 	cdb_assert(cdb);
682 | 	cdb_assert(cdb->opened);
683 | 	cdb_assert(key);
684 | 	cdb_assert(count);
685 | 	cdb_file_pos_t value = { 0, 0, };
686 | 	uint64_t c = UINT64_MAX;
687 | 	const int r = cdb_retrieve(cdb, key, &value, &c);
688 | 	c = r == CDB_FOUND_E ? c + 1l : c;
689 | 	*count = c;
690 | 	return r;
691 | }
692 | 
693 | int cdb_foreach(cdb_t *cdb, cdb_callback cb, void *param) {
694 | 	cdb_assert(cdb);
695 | 	cdb_assert(cdb->opened);
696 | 	if (cdb->error || cdb->create)
697 | 		goto fail;
698 | 	cdb_word_t pos = cdb->file_start + (256ul * (2ul * cdb_get_size(cdb)));
699 | 	int r = 0;
700 | 	for (;pos < cdb->hash_start;) {
701 | 		if (cdb_seek_internal(cdb, pos) < 0)
702 | 			goto fail;
703 | 		cdb_word_t klen = 0, vlen = 0;
704 | 		if (cdb_read_word_pair(cdb, &klen, &vlen) < 0)
705 | 			goto fail;
706 | 		const cdb_file_pos_t key   = { .length = klen, .position = pos + (2ul * cdb_get_size(cdb)), };
707 | 		const cdb_file_pos_t value = { .length = vlen, .position = pos + (2ul * cdb_get_size(cdb)) + klen, };
708 | 		if (cdb_bound_check(cdb, value.position > cdb->hash_start) < 0)
709 | 			goto fail;
710 | 		if (cdb_bound_check(cdb, (value.position + value.length) > cdb->hash_start) < 0)
711 | 			goto fail;
712 | 		r = cb ? cb(cdb, &key, &value, param) : 0;
713 | 		if (r < 0)
714 | 			goto fail;
715 | 		if (r > 0) /* early termination */
716 | 			break;
717 | 		pos = value.position + value.length;
718 | 	}
719 | 	return cdb_failure(cdb) < 0 ? CDB_ERROR_E : r;
720 | fail:
721 | 	return cdb_error(cdb, CDB_ERROR_E);
722 | }
723 | 
724 | static int cdb_round_up_to_next_power_of_two(const cdb_word_t x) {
725 | 	cdb_word_t p = 1ul;
726 | 	while (p < x)
727 | 		p <<= 1ul;
728 | 	return p;
729 | }
730 | 
/* Append the pair ('hash', 'position') to the in-memory table of the bucket
 * selected by 'hash', growing the table's arrays as needed.
 *
 * The arrays are (re)allocated when the table is empty and whenever the
 * entry count passes a power of two, the new capacity being double the
 * current count (one for an empty table).
 *
 * Returns CDB_ERROR_E on overflow or allocation failure (the error is
 * recorded in the handle), otherwise the handle's state as reported by
 * 'cdb_failure'. */
static int cdb_hash_grow(cdb_t *cdb, const cdb_word_t hash, const cdb_word_t position) {
	cdb_assert(cdb);
	cdb_hash_table_t *t1 = &cdb->table1[hash % CDB_BUCKETS];
	cdb_word_t *hashes = t1->hashes, *fps = t1->fps;
	const cdb_word_t next = cdb_round_up_to_next_power_of_two(t1->header.length + 1ul);
	const cdb_word_t cur  = cdb_round_up_to_next_power_of_two(t1->header.length);
	if (cdb_overflow_check(cdb, (t1->header.length + 1ul) < t1->header.length) < 0)
		return CDB_ERROR_E;
	if (next > cur || t1->header.length == 0) { /* out of capacity */
		const cdb_word_t alloc = t1->header.length == 0 ? 1ul : t1->header.length * 2ul;
		if (cdb_overflow_check(cdb, (t1->header.length * 2ul) < t1->header.length) < 0)
			return CDB_ERROR_E;
		if (!(hashes = cdb_reallocate(cdb, t1->hashes, alloc * sizeof (*t1->hashes))))
			return CDB_ERROR_E;
		/* store the new pointer straight away so that 'cdb_hash_free'
		 * releases it if the second reallocation fails */
		t1->hashes = hashes;
		if (!(fps = cdb_reallocate(cdb, t1->fps, alloc * sizeof (*t1->fps)))) {
			(void)cdb_hash_free(cdb, t1);
			return CDB_ERROR_E;
		}
	}
	t1->hashes = hashes;
	t1->fps    = fps;
	t1->hashes[t1->header.length] = hash;
	t1->fps[t1->header.length]    = position;
	t1->header.length++;
	return cdb_failure(cdb);
}
758 | 
759 | /* Duplicate keys can be added. To prevent this the library could easily be
760 |  * improved in a backwards compatible way by extending the options structure
761 |  * to include a new options value that would specify if adding duplicate keys
762 |  * is allowed (adding values to the end of a structure being backwards
763 |  * compatible in (most/all?) C ABIs). "cdb_add" would then need to be extended
764 |  * to check for duplicate keys, which would be the difficult bit, a new lookup
765 |  * function would need to be designed that could query the partially written
766 |  * database. */
767 | int cdb_add(cdb_t *cdb, const cdb_buffer_t *key, const cdb_buffer_t *value) { /* Append one key/value record to a database that was opened for creation. Returns the result of cdb_failure() on success; on failure returns whatever cdb_error() yields (the header documents failures as < 0). */
768 | 	cdb_preconditions(cdb);
769 | 	cdb_assert(cdb->opened);
770 | 	cdb_assert(cdb->ops.hash);
771 | 	cdb_assert(key);
772 | 	cdb_assert(value);
773 | 	cdb_assert(cdb->position >= cdb->file_start);
774 | 	if (CDB_WRITE_ON == 0) /* write support disabled at build time */
775 | 		return cdb_error(cdb, CDB_ERROR_DISABLED_E);
776 | 	if (cdb->error) /* an earlier error has been recorded on this handle: refuse to add */
777 | 		goto fail;
778 | 	if (cdb->create == 0) { /* the handle was not opened for creation */
779 | 		(void)cdb_error(cdb, CDB_ERROR_MODE_E);
780 | 		goto fail;
781 | 	}
782 | 	if (cdb_overflow_check(cdb, (key->length + value->length) < key->length) < 0) /* the combined length must not wrap */
783 | 		goto fail;
784 | 	const cdb_word_t h = cdb->ops.hash((uint8_t*)(key->buffer), key->length) & cdb_get_mask(cdb); /* key hash, masked with cdb_get_mask() */
785 | 		if (cdb_hash_grow(cdb, h, cdb->position) < 0) /* remember (hash, record offset) in memory before the record itself is written */
786 | 		goto fail;
787 | 	if (cdb_seek_internal(cdb, cdb->position) < 0)
788 | 		goto fail;
789 | 	if (cdb_write_word_pair(cdb, key->length, value->length) < 0) /* record layout: key length, value length, key bytes, value bytes */
790 | 		goto fail;
791 | 	if (cdb_write(cdb, key->buffer, key->length) != key->length)
792 | 		goto fail;
793 | 	if (cdb_write(cdb, value->buffer, value->length) != value->length)
794 | 		goto fail;
795 | 	cdb->empty = 0; /* at least one record has now been written */
796 | 	return cdb_failure(cdb);
797 | fail:
798 | 	return cdb_error(cdb, CDB_ERROR_E);
799 | }
800 | 
801 | uint64_t cdb_prng(uint64_t s[2]) { /* XORSHIFT128 step: advances the two word state 's' in place and returns the next 64-bit value */
802 | 	cdb_assert(s);
803 | 	if ((s[0] | s[1]) == 0) /* an all-zero state is replaced by a fixed non-zero one */
804 | 		s[0] = 1;
805 | 	const uint64_t lo = s[1];
806 | 	uint64_t hi = s[0];
807 | 	hi ^= hi << 23;
808 | 	hi ^= hi >> 18;
809 | 	hi ^= lo ^ (lo >> 5);
810 | 	s[0] = lo; /* the old second word becomes the new first word */
811 | 	s[1] = hi;
812 | 	return hi + lo;
813 | }
815 | 
816 | #define CDB_TEST_VECTOR_LEN (1024ul) /* size in bytes of each key/value/result buffer of a test vector; also the vector count and length limit used unless the database is 16-bit */
817 | 
818 | /* A series of optional unit tests that can be compiled out
819 |  * of the program, the function will still remain even if the
820 |  * contents of it are elided. A database is created in "test_file",
    filled with pseudo-random records and a list of known duplicates, closed,
    opened again with "create" set to zero and then queried. Returns CDB_OK_E
    on success (or when the tests are compiled out), CDB_ERROR_E or -1 on a
    hard failure, and -3 to -7 to identify the last check that failed. */
821 | int cdb_tests(const cdb_options_t *ops, const char *test_file) {
822 | 	cdb_assert(ops);
823 | 	cdb_assert(test_file);
824 | 	CDB_BUILD_BUG_ON(sizeof (cdb_word_t) < 2);
825 | 
826 | 	/* See readme.md for description of this and why this
827 | 	 * is the way it is. Note that if "CDB_TESTS_ON" is
828 | 	 * zero the rest of the code will be removed by the
829 | 	 * compiler though. */
830 | 	if (CDB_TESTS_ON == 0) 
831 | 		return CDB_OK_E;
832 | 
833 | 	const size_t l = ops->size;
834 | 	const size_t vectors = l == 16ul ? 128ul : CDB_TEST_VECTOR_LEN; /* 16-bit databases get fewer and shorter test vectors */
835 | 	const size_t klen    = l == 16ul ?  64ul : CDB_TEST_VECTOR_LEN;
836 | 	const size_t vlen    = l == 16ul ?  64ul : CDB_TEST_VECTOR_LEN;
837 | 
838 | 	typedef struct { /* one record that is expected to be in the database */
839 | 		char key[CDB_TEST_VECTOR_LEN], value[CDB_TEST_VECTOR_LEN], result[CDB_TEST_VECTOR_LEN]; /* 'result' receives the value read back from the database */
840 | 		uint64_t recno; /* record number handed to cdb_lookup for this key */
841 | 		cdb_word_t klen, vlen; /* number of bytes of 'key' and 'value' in use */
842 | 	} test_t;
843 | 
844 | 	typedef struct { char *key, *value; } test_duplicate_t;
845 | 
846 | 	static const test_duplicate_t dups[] = { /* add known duplicates */
847 | 		{ "ALPHA",    "BRAVO",     },
848 | 		{ "ALPHA",    "CHARLIE",   },
849 | 		{ "ALPHA",    "DELTA",     },
850 | 		{ "FSF",      "Collide-1", },
851 | 		{ "Aug",      "Collide-2", },
852 | 		{ "FSF",      "Collide-3", },
853 | 		{ "Aug",      "Collide-4", },
854 | 		{ "revolves", "Collide-1", },
855 | 		{ "revolt's", "Collide-2", },
856 | 		{ "revolt's", "Collide-3", },
857 | 		{ "revolt's", "Collide-4", },
858 | 		{ "revolves", "Collide-5", },
859 | 		{ "revolves", "Collide-6", },
860 | 		{ "1234",     "5678",      },
861 | 		{ "1234",     "9ABC",      },
862 | 		{ "",         "",          },
863 | 		{ "",         "X",         },
864 | 		{ "",         "",          },
865 | 	};
866 | 	const size_t dupcnt = sizeof (dups) / sizeof (dups[0]);
867 | 
868 | 	cdb_t *cdb = NULL;
869 | 	test_t *ts = NULL;
870 | 	uint64_t s[2] = { 0, }; /* fixed PRNG seed, so the generated vectors are the same on every run */
871 | 	int r = CDB_OK_E;
872 | 
873 | 	if (cdb_open(&cdb, ops, 1, test_file) < 0) /* third argument is 'create': build a new database */
874 | 		return CDB_ERROR_E;
875 | 
876 | 	if (!(ts = cdb_allocate(cdb, (dupcnt + vectors) * (sizeof *ts)))) /* random vectors first, then one slot per known duplicate; NOTE(review): the code below relies on this memory being zeroed - confirm cdb_allocate does that */
877 | 		goto fail;
878 | 
879 | 	for (unsigned i = 0; i < vectors; i++) {
880 | 		char *k = ts[i].key;
881 | 		char *v = ts[i].value;
882 | 		const cdb_word_t kl = (cdb_prng(s) % (klen - 1ul)) + 1ul; /* length in 1 .. klen - 1 */
883 | 		const cdb_word_t vl = (cdb_prng(s) % (vlen - 1ul)) + 1ul; /* length in 1 .. vlen - 1 */
884 | 		for (unsigned long j = 0; j < kl; j++) 
885 | 			k[j] = 'a' + (cdb_prng(s) % 26); /* this is biased, so what, fight me */
886 | 		for (unsigned long j = 0; j < vl; j++)
887 | 			v[j] = 'a' + (cdb_prng(s) % 26);
888 | 		const cdb_buffer_t key   = { .length = kl, .buffer = k };
889 | 	       	const cdb_buffer_t value = { .length = vl, .buffer = v };
890 | 		for (unsigned long j = 0; j < i; j++) /* NOTE(review): the record number is derived from equal *values* (compared over 'vlen' bytes), not equal keys - confirm this is intended */
891 | 			if (memcmp(ts[i].value, ts[j].value, vlen) == 0)
892 | 				ts[i].recno++;
893 | 		if (cdb_add(cdb, &key, &value) < 0)
894 | 			goto fail;
895 | 		ts[i].klen = kl;
896 | 		ts[i].vlen = vl;
897 | 	}
898 | 
899 | 	for (size_t i = 0; i < dupcnt; i++) {
900 | 		test_duplicate_t d = dups[i];
901 | 		const cdb_buffer_t key   = { .length = strlen(d.key),   .buffer = d.key };
902 | 		const cdb_buffer_t value = { .length = strlen(d.value), .buffer = d.value };
903 | 
904 | 		memcpy(ts[i + vectors].key,   key.buffer,   key.length);
905 | 		memcpy(ts[i + vectors].value, value.buffer, value.length);
906 | 
907 | 		for (unsigned long j = 0; j < i; j++) /* NOTE(review): this indexes ts[i]/ts[j] (the random vectors) rather than ts[i + vectors], and 'klen'/'vlen' of the duplicate slots are never assigned in this loop - confirm */
908 | 			if (memcmp(ts[i].value, ts[j].value, vlen) == 0)
909 | 				ts[i].recno++;
910 | 
911 | 		if (cdb_add(cdb, &key, &value) < 0)
912 | 			goto fail;
913 | 	}
914 | 
915 | 
916 | 	if (cdb_close(cdb) < 0) { /* closing is the last step of creating the database */
917 | 		(void)ops->allocator(ops->arena, ts, 0, 0); /* release 'ts' through the allocator callback directly (new size 0) instead of through the handle that was just closed */
918 | 		return -1;
919 | 	}
920 | 	cdb = NULL;
921 | 
922 | 	if (cdb_open(&cdb, ops, 0, test_file) < 0) { /* 'create' is zero this time: open the database that was just written */
923 | 		(void)ops->allocator(ops->arena, ts, 0, 0);
924 | 		return -1;
925 | 	}
926 | 
927 | 	for (unsigned i = 0; i < (vectors + dupcnt); i++) {
928 | 		test_t *t = &ts[i];
929 | 		const cdb_buffer_t key = { .length = t->klen, .buffer = t->key };
930 | 		cdb_file_pos_t result = { 0, 0 }, discard = { 0, 0 };
931 | 		const int g = cdb_lookup(cdb, &key, &result, t->recno);
932 | 		if (g < 0)
933 | 			goto fail;
934 | 		if (g == CDB_NOT_FOUND_E) {
935 | 			r = -3; /* -2 not used */
936 | 			continue;
937 | 		}
938 | 
939 | 		const int d = cdb_get(cdb, &key, &discard);
940 | 		if (d < 0)
941 | 			goto fail;
942 | 		if (d == CDB_NOT_FOUND_E) /* cdb_lookup found the key but cdb_get did not */
943 | 			r = -4;
944 | 
945 | 		if (result.length > vlen) /* would not fit in t->result */
946 | 			goto fail;
947 | 		if (result.length != t->vlen) { /* stored value has an unexpected length */
948 | 			r = -5;
949 | 		} else {
950 | 			if (cdb_seek_internal(cdb, result.position) < 0)
951 | 				goto fail;
952 | 			if (cdb_read_internal(cdb, t->result, result.length) != result.length)
953 | 				goto fail;
954 | 			if (memcmp(t->result, t->value, result.length)) /* stored value differs from what was added */
955 | 				r = -6;
956 | 		}
957 | 
958 | 		uint64_t cnt = 0;
959 | 		if (cdb_count(cdb, &key, &cnt) < 0)
960 | 			goto fail;
961 | 		if (cnt < t->recno) /* fewer records for this key than the record number looked up */
962 | 			r = -7;
963 | 	}
964 | 
965 | 	if (cdb_free(cdb, ts) < 0)
966 | 		r = -1;
967 | 	if (cdb_close(cdb) < 0)
968 | 		r = -1;
969 | 	return r;
970 | fail:
971 | 	(void)ops->allocator(ops->arena, ts, 0, 0);
972 | 	(void)cdb_close(cdb);
973 | 	return CDB_ERROR_E;
974 | }
975 | 
976 | 


--------------------------------------------------------------------------------
/cdb.h:
--------------------------------------------------------------------------------
 1 | /* Consult the "readme.md" file in the repository for a detailed
 2 |  * description of the API and the internals. */
 3 | #ifndef CDB_H
 4 | #define CDB_H
 5 | #ifdef __cplusplus
 6 | extern "C" {
 7 | #endif
 8 | 
 9 | #define CDB_PROJECT "Constant Database"
10 | #define CDB_AUTHOR  "Richard James Howe"
11 | #define CDB_EMAIL   "howe.r.j.89@gmail.com"
12 | #define CDB_LICENSE "The Unlicense"
13 | #define CDB_REPO    "https://github.com/howerj/cdb"
14 | 
15 | #include <stddef.h>
16 | #include <stdint.h>
17 | 
18 | #ifndef CDB_API
19 | #define CDB_API /* Used to apply attributes to exported functions */
20 | #endif
21 | 
22 | #ifndef CDB_WORD_T /* defining CDB_WORD_T before including this header suppresses the typedef below; NOTE(review): the includer then presumably has to supply its own 'cdb_word_t' - confirm */
23 | typedef uint64_t cdb_word_t; /* valid sizes: uint64_t, uint32_t, uint16_t */
24 | #endif
25 | 
26 | struct cdb; /* only declared here: the structure's members are not visible to users of this header */
27 | typedef struct cdb cdb_t;
28 | 
29 | enum { CDB_RO_MODE, CDB_RW_MODE, }; /* passed to "open" in the "mode" option */
30 | 
31 | typedef struct {
32 | 	void *(*allocator)(void *arena, void *ptr, size_t oldsz, size_t newsz); /* allocate, resize and free memory; cdb_tests() frees by calling it with "oldsz" and "newsz" both zero */
33 | 	cdb_word_t (*hash)(const uint8_t *data, size_t length); /* hash function: NULL defaults to djb hash */
34 | 	int (*compare)(const void *a, const void *b, size_t length); /* key comparison function: NULL defaults to memcmp */
35 | 	cdb_word_t (*read)(void *file, void *buf, size_t length); /* always needed, read from a resource */
36 | 	cdb_word_t (*write)(void *file, void *buf, size_t length); /* (conditionally optional) needed for db creation only, write to a resource */
37 | 	int (*seek)(void *file, uint64_t offset); /* "tell" is not needed as we keep track of the file position internally */
38 | 	void *(*open)(const char *name, int mode); /* open up a resource, which may or may not be a file, for reading (mode = CDB_RO_MODE) or read/write (mode = CDB_RW_MODE) */
39 | 	int (*close)(void *file); /* close a resource opened up with "open" */
40 | 	int (*flush)(void *file); /* (optional) called at end of successful creation */
41 | 
42 | 	void *arena;       /* used for 'arena' argument for the allocator, can be NULL if allocator allows it */
43 | 	cdb_word_t offset; /* starting offset for CDB file if not at beginning of file */
44 | 	unsigned size;     /* Either 0 (defaults 32), 16, 32 or 64, but cannot be bigger than 'sizeof(cdb_word_t)*8' in any case */
45 | } cdb_options_t; /* a file abstraction layer, could point to memory, flash, or disk */
46 | 
47 | typedef struct {
48 | 	cdb_word_t length; /* length of data */
49 | 	char *buffer;      /* pointer to arbitrary data */
50 | } cdb_buffer_t; /* used to represent a key or value in memory */
51 | 
52 | typedef struct {
53 | 	cdb_word_t position; /* position in file, for use with cdb_read/cdb_seek */
54 | 	cdb_word_t length;   /* length of data on disk, for use with cdb_read */
55 | } cdb_file_pos_t; /* used to represent a value on disk that can be accessed via 'cdb_options_t' */
56 | 
57 | typedef int (*cdb_callback)(cdb_t *cdb, const cdb_file_pos_t *key, const cdb_file_pos_t *value, void *param); /* called by cdb_foreach for each record: return < 0 to abort with an error, > 0 to stop early, 0 to continue */
58 | 
59 | /* All functions return: < 0 on failure, 0 on success/not found, 1 on found if applicable */
60 | CDB_API int cdb_open(cdb_t **cdb, const cdb_options_t *ops, int create, const char *file); /* arena may be NULL, allocator must be present */
61 | CDB_API int cdb_close(cdb_t *cdb);  /* free cdb, close handles (and write to disk if in create mode) */
62 | CDB_API int cdb_read(cdb_t *cdb, void *buf, cdb_word_t length); /* Returns error code not length! Not being able to read "length" bytes is an error! */
63 | CDB_API int cdb_add(cdb_t *cdb, const cdb_buffer_t *key, const cdb_buffer_t *value); /* do not call cdb_read and/or cdb_seek in open mode */
64 | CDB_API int cdb_seek(cdb_t *cdb, cdb_word_t position); /* set the position used by the next cdb_read, e.g. to the "position" of a cdb_file_pos_t */
65 | CDB_API int cdb_foreach(cdb_t *cdb, cdb_callback cb, void *param); /* call "cb" with the on-disk location of each key/value pair; "param" is handed to "cb" unchanged */
66 | CDB_API int cdb_read_word_pair(cdb_t *cdb, cdb_word_t *w1, cdb_word_t *w2); /* NOTE(review): presumably reads two consecutive words at the current position - confirm in cdb.c */
67 | CDB_API int cdb_get(cdb_t *cdb, const cdb_buffer_t *key, cdb_file_pos_t *value); /* look up "key", on-disk location of its value is stored in "value"; NOTE(review): presumably the first record for the key - confirm */
68 | CDB_API int cdb_lookup(cdb_t *cdb, const cdb_buffer_t *key, cdb_file_pos_t *value, uint64_t record); /* as cdb_get with an extra "record" number; NOTE(review): presumably selects among duplicate keys - confirm */
69 | CDB_API int cdb_count(cdb_t *cdb, const cdb_buffer_t *key, uint64_t *count); /* NOTE(review): presumably stores the number of records with "key" in "count" - confirm */
70 | CDB_API int cdb_status(cdb_t *cdb); /* returns CDB error status */
71 | CDB_API int cdb_version(unsigned long *version); /* version number in x.y.z format, z = LSB, MSB is library info */
72 | CDB_API int cdb_tests(const cdb_options_t *ops, const char *test_file); /* run the built in tests, "test_file" is created and then read back */
73 | 
74 | CDB_API uint64_t cdb_prng(uint64_t s[2]); /* "s" is PRNG state, you can set it to any value you like to seed */
75 | CDB_API cdb_word_t cdb_hash(const uint8_t *data, size_t length); /* hash used by original CDB program */
76 | 
77 | #ifdef __cplusplus
78 | }
79 | #endif
80 | #endif
81 | 


--------------------------------------------------------------------------------
/host.c:
--------------------------------------------------------------------------------
 1 | #include "cdb.h"
 2 | #include "host.h"
 3 | 
 4 | #include <assert.h>
 5 | #include <stdio.h>
 6 | #include <stdlib.h>
 7 | 
 8 | #define UNUSED(X) ((void)(X))
 9 | 
10 | typedef struct { /* state behind the opaque 'file' pointer that the callbacks in this file receive */
11 | 	FILE *handle; /* the stdio stream doing the actual I/O */
12 | 	size_t length; /* size in bytes of 'buffer' */
13 | 	char buffer[]; /* handed to setvbuf() as the stream's buffer, allocated together with this structure */
14 | } file_t;
15 | 
16 | static void *cdb_allocator_cb(void *arena, void *ptr, const size_t oldsz, const size_t newsz) { /* allocator callback on top of realloc/free: a new size of zero frees 'ptr' and yields NULL, a larger size reallocates, anything else returns 'ptr' untouched */
17 | 	UNUSED(arena);
18 | 	if (newsz != 0) /* never shrink: only a strictly larger request reaches realloc */
19 | 		return newsz > oldsz ? realloc(ptr, newsz) : ptr;
20 | 	free(ptr);
21 | 	return NULL;
22 | }
26 | 
27 | static cdb_word_t cdb_read_cb(void *file, void *buf, size_t length) { /* read callback: fread of up to 'length' bytes into 'buf', returns the number of bytes read */
28 | 	file_t *const fb = file;
29 | 	assert(fb);
30 | 	assert(buf);
31 | 	assert(fb->handle);
32 | 	return fread(buf, 1, length, fb->handle);
33 | }
33 | 
34 | static cdb_word_t cdb_write_cb(void *file, void *buf, size_t length) { /* write callback: fwrite of 'length' bytes from 'buf', returns the number of bytes written */
35 | 	file_t *const fb = file;
36 | 	assert(fb);
37 | 	assert(buf);
38 | 	assert(fb->handle);
39 | 	return fwrite(buf, 1, length, fb->handle);
40 | }
40 | 
41 | static int cdb_seek_cb(void *file, uint64_t offset) { /* seek callback: position the stream at the absolute byte 'offset'; returns 0 on success and non-zero on failure, including offsets that do not fit the 'long' taken by fseek */
42 | 	assert(file);
43 | 	assert(((file_t*)file)->handle);
44 | 	const long pos = (long)offset;
45 | 	if (pos < 0 || (uint64_t)pos != offset) /* the value changed when narrowed to 'long': fail instead of silently seeking to a different place */
46 | 		return -1;
47 | 	return fseek(((file_t*)file)->handle, pos, SEEK_SET);
48 | }
46 | 
47 | static void *cdb_open_cb(const char *name, int mode) { /* open callback: opens 'name' with stdio ("wb+" for CDB_RW_MODE, "rb" otherwise) and gives the stream a 16KiB fully buffered I/O buffer; returns a file_t to be released with cdb_close_cb, or NULL on failure */
48 | 	assert(name);
49 | 	assert(mode == CDB_RO_MODE || mode == CDB_RW_MODE);
50 | 	const char *mode_string = mode == CDB_RW_MODE ? "wb+" : "rb";
51 | 	FILE *f = fopen(name, mode_string);
52 | 	if (!f)
53 | 		return NULL;
54 | 	const size_t length = 1024ul * 16ul;
55 | 	file_t *fb = malloc(sizeof (*fb) + length); /* size of the file_t header plus its flexible buffer; this used sizeof(FILE), which is unrelated and may be smaller than the header */
56 | 	if (!fb) {
57 | 		fclose(f);
58 | 		return NULL;
59 | 	}
60 | 	fb->handle = f;
61 | 	fb->length = length;
62 | 	if (setvbuf(f, fb->buffer, _IOFBF, fb->length) != 0) { /* setvbuf signals failure with any non-zero value, not only negative ones */
63 | 		fclose(f); /* close first: the stream must not outlive the buffer it may reference */
64 | 		free(fb);
65 | 		return NULL;
66 | 	}
67 | 	return fb;
68 | }
69 | 
70 | static int cdb_close_cb(void *file) { /* close callback: closes the stream, frees the file_t and returns the result of fclose */
71 | 	file_t *const fb = file;
72 | 	assert(fb);
73 | 	assert(fb->handle);
74 | 	const int status = fclose(fb->handle);
75 | 	fb->handle = NULL;
76 | 	free(fb); /* the stream buffer lives inside 'fb', so this has to come after fclose */
77 | 	return status;
78 | }
78 | 
79 | static int cdb_flush_cb(void *file) { /* flush callback: returns the result of fflush on the stream */
80 | 	assert(file);
81 | 	FILE *const stream = ((file_t*)file)->handle;
82 | 	return fflush(stream);
83 | }
83 | 
84 | const cdb_options_t cdb_host_options = { /* ready made options that back the library with the C stdio callbacks defined in this file */
85 | 	.allocator = cdb_allocator_cb,
86 | 	.hash      = NULL, /* per cdb.h a NULL hash selects the default (djb) hash */
87 | 	.compare   = NULL, /* per cdb.h a NULL compare selects memcmp */
88 | 	.read      = cdb_read_cb,
89 | 	.write     = cdb_write_cb,
90 | 	.seek      = cdb_seek_cb,
91 | 	.open      = cdb_open_cb,
92 | 	.close     = cdb_close_cb,
93 | 	.flush     = cdb_flush_cb,
94 | 	.arena     = NULL, /* cdb_allocator_cb ignores its arena argument */
95 | 	.offset    = 0, /* database starts at the beginning of the file */
96 | 	.size      = 0, /* auto-select */
97 | };
98 | 
99 | 


--------------------------------------------------------------------------------
/host.h:
--------------------------------------------------------------------------------
1 | #ifndef CDB_HOST_H
2 | #define CDB_HOST_H
3 | 
4 | #include "cdb.h"
5 | 
6 | extern const cdb_options_t cdb_host_options; /* callbacks implemented with C stdio and malloc, defined in host.c */
7 | 
8 | #endif
9 | 


--------------------------------------------------------------------------------
/main.c:
--------------------------------------------------------------------------------
  1 | /* Program: Constant Database Driver
  2 |  * Author:  Richard James Howe
  3 |  * Email:   howe.r.j.89@gmail.com
  4 |  * License: Unlicense
  5 |  * Repo:    <https://github.com/howerj/cdb> */
  6 | 
  7 | #include "cdb.h"
  8 | #include "host.h"
  9 | #include <assert.h>
 10 | #include <ctype.h>
 11 | #include <errno.h>
 12 | #include <limits.h>
 13 | #include <stdarg.h>
 14 | #include <stdio.h>
 15 | #include <stdlib.h>
 16 | #include <string.h>
 17 | 
 18 | #define UNUSED(X)      ((void)(X)) /* marks a parameter as deliberately unused */
 19 | #define MIN(X, Y)      ((X) < (Y) ? (X) : (Y)) /* N.B. an argument may be evaluated twice */
 20 | #define MAX(X, Y)      ((X) > (Y) ? (X) : (Y)) /* N.B. an argument may be evaluated twice */
 21 | #define IO_BUFFER_SIZE (1024u) /* size of the copy buffer in cdb_print and initial size of the key/value buffers in cdb_create */
 22 | #define DISTMAX        (10ul) /* number of entries in the distance histogram printed by cdb_stats_print */
 23 | 
 24 | #ifdef _WIN32 /* On Windows binary() switches a stream out of text mode and into binary mode. */
 25 | #include <windows.h>
 26 | #include <io.h>
 27 | #include <fcntl.h>
 28 | static void binary(FILE *f) { _setmode(_fileno(f), _O_BINARY); } /* only platform specific code... */
 29 | #else
 30 | static inline void binary(FILE *f) { UNUSED(f); } /* nothing to do on other platforms */
 31 | #endif
 32 | 
 33 | typedef struct { /* totals accumulated by cdb_stats and reported by cdb_stats_print */
 34 | 	unsigned long records; /* number of key/value pairs seen */
 35 | 	unsigned long total_key_length, total_value_length; /* sums of all key and all value lengths */
 36 | 	unsigned long min_key_length, min_value_length; /* shortest key and value seen */
 37 | 	unsigned long max_key_length, max_value_length; /* longest key and value seen */
 38 | 	unsigned long hash_start; /* NOTE(review): not referenced by the code reviewed here - confirm whether it is still needed */
 39 | } cdb_statistics_t;
 40 | 
 41 | typedef struct {
 42 | 	char *arg;   /* parsed argument */
 43 | 	int error,   /* turn error reporting on/off */
 44 | 	    index,   /* index into argument list */
 45 | 	    option,  /* parsed option */
 46 | 	    reset;   /* set to reset */
 47 | 	char *place; /* internal use: scanner position */
 48 | 	int  init;   /* internal use: initialized or not */
 49 | } cdb_getopt_t;      /* getopt clone; with a few modifications */
 50 | 
 51 | static unsigned verbose = 0; /* when zero info() prints nothing */
 52 | 
 53 | static void info(const char *fmt, ...) { /* printf style message to stderr with a newline appended; does nothing while 'verbose' is zero */
 54 | 	assert(fmt);
 55 | 	if (verbose != 0) {
 56 | 		va_list args;
 57 | 		va_start(args, fmt);
 58 | 		(void)vfprintf(stderr, fmt, args);
 59 | 		va_end(args);
 60 | 		(void)fputc('\n', stderr);
 61 | 		(void)fflush(stderr);
 62 | 	}
 63 | }
 65 | 
 66 | static void die(const char *fmt, ...) { /* printf style message to stderr with a newline appended, then exit(EXIT_FAILURE); never returns */
 67 | 	assert(fmt);
 68 | 	va_list args;
 69 | 	va_start(args, fmt);
 70 | 	(void)vfprintf(stderr, fmt, args);
 71 | 	va_end(args);
 72 | 	(void)fputc('\n', stderr);
 73 | 	(void)fflush(stderr); /* make sure the message is out before the process ends */
 74 | 	exit(EXIT_FAILURE);
 75 | }
 77 | 
 78 | /* Adapted from: <https://stackoverflow.com/questions/10404448>, this
 79 |  * could be extended to parse out numeric values, and do other things, but
 80 |  * that is not needed here. The function and structure should be turned
 81 |  * into a header only library.
    *
    * Each call parses the next option letter of "argv" according to "fmt",
    * in which a letter followed by ':' takes an argument. The return value
    * is the option letter, -1 when there are no more options, '?' for an
    * unknown option or a missing argument, ':' for a missing argument when
    * "fmt" begins with ':', and '!' if the error message could not be
    * printed. "opt" holds the scanner state between calls and must start
    * out zeroed. */
 82 | static int cdb_getopt(cdb_getopt_t *opt, const int argc, char *const argv[], const char *fmt) {
 83 | 	assert(opt);
 84 | 	assert(fmt);
 85 | 	assert(argv);
 86 | 	enum { BADARG_E = ':', BADCH_E = '?', BADIO_E = '!', };
 87 | 
 88 | 	if (!(opt->init)) { /* first call: start scanning at argv[1] */
 89 | 		opt->place = ""; /* option letter processing */
 90 | 		opt->init  = 1;
 91 | 		opt->index = 1;
 92 | 	}
 93 | 
 94 | 	if (opt->reset || !*opt->place) { /* update scanning pointer */
 95 | 		opt->reset = 0;
 96 | 		if (opt->index >= argc || *(opt->place = argv[opt->index]) != '-') { /* out of arguments, or the argument is not an option */
 97 | 			opt->place = "";
 98 | 			return -1;
 99 | 		}
100 | 		if (opt->place[1] && *++opt->place == '-') { /* found "--" */
101 | 			opt->index++;
102 | 			opt->place = "";
103 | 			return -1;
104 | 		}
105 | 	}
106 | 
107 | 	const char *oli = NULL; /* option letter list index */
108 | 	if ((opt->option = *opt->place++) == ':' || !(oli = strchr(fmt, opt->option))) { /* option letter okay? */
109 | 		 /* if the user didn't specify '-' as an option, assume it means -1.  */
110 | 		if (opt->option == '-')
111 | 			return -1;
112 | 		if (!*opt->place) /* last letter of this argument: continue with the next argument */
113 | 			opt->index++;
114 | 		if (opt->error && *fmt != ':')
115 | 			if (fprintf(stderr, "illegal option -- %c\n", opt->option) < 0)
116 | 				return BADIO_E;
117 | 		return BADCH_E;
118 | 	}
119 | 
120 | 	if (*++oli != ':') { /* don't need argument */
121 | 		opt->arg = NULL;
122 | 		if (!*opt->place)
123 | 			opt->index++;
124 | 	} else {  /* need an argument */
125 | 		if (*opt->place) { /* no white space */
126 | 			opt->arg = opt->place;
127 | 		} else if (argc <= ++opt->index) { /* no arg */
128 | 			opt->place = "";
129 | 			if (*fmt == ':')
130 | 				return BADARG_E;
131 | 			if (opt->error)
132 | 				if (fprintf(stderr, "option requires an argument -- %c\n", opt->option) < 0)
133 | 					return BADIO_E;
134 | 			return BADCH_E;
135 | 		} else	{ /* white space */
136 | 			opt->arg = argv[opt->index];
137 | 		}
138 | 		opt->place = ""; /* the argument has been consumed: the next call starts at a new argv entry */
139 | 		opt->index++;
140 | 	}
141 | 	return opt->option; /* dump back option letter */
142 | }
143 | 
144 | static int cdb_print(cdb_t *cdb, const cdb_file_pos_t *fp, FILE *output) { /* Copy the 'fp->length' bytes stored at 'fp->position' in the database to 'output'. Returns 0 on success and -1 if seeking, reading or writing fails. */
145 | 	assert(cdb);
146 | 	assert(fp);
147 | 	assert(output);
148 | 	if (cdb_seek(cdb, fp->position) < 0)
149 | 		return -1;
150 | 	char buf[IO_BUFFER_SIZE];
151 | 	const size_t length = fp->length;
152 | 	for (size_t i = 0; i < length; i += sizeof buf) { /* N.B. Double buffering! */
153 | 		const size_t l = MIN(sizeof buf, length - i); /* copy one buffer sized chunk per iteration, so data longer than 'buf' is printed instead of being rejected */
154 | 		assert(l <= sizeof buf);
155 | 		if (cdb_read(cdb, buf, l) < 0)
156 | 			return -1;
157 | 		if (fwrite(buf, 1, l, output) != l)
158 | 			return -1;
159 | 	}
160 | 	return 0;
161 | }
164 | 
165 | static inline void cdb_reverse_char_array(char * const r, const size_t length) { /* reverse the first 'length' bytes of 'r' in place; lengths of zero and one leave it unchanged */
166 | 	assert(r);
167 | 	for (size_t lo = 0, hi = length; lo + 1 < hi; lo++) { /* 'lo' and 'hi' walk towards each other from the two ends */
168 | 		hi--;
169 | 		const char t = r[lo];
170 | 		r[lo] = r[hi];
171 | 		r[hi] = t;
172 | 	}
173 | }
174 | 
175 | static unsigned cdb_number_to_string(char b[65 /* max int size in base 2, + NUL*/], cdb_word_t u, int base) { /* write 'u' in 'base' (2 to 10) into 'b' as a NUL terminated string and return the number of digits written */
176 | 	assert(b);
177 | 	assert(base >= 2 && base <= 10);
178 | 	const cdb_word_t radix = base;
179 | 	unsigned n = 0;
180 | 	for (;;) { /* digits are produced least significant first; zero still produces one digit */
181 | 		b[n++] = (u % radix) + '0';
182 | 		assert(n <= 64);
183 | 		u /= radix;
184 | 		if (!u)
185 | 			break;
186 | 	}
187 | 	b[n] = '\0';
188 | 	cdb_reverse_char_array(b, n); /* put the most significant digit first */
189 | 	assert(b[n] == '\0');
190 | 	return n;
191 | }
192 | 
193 | static int cdb_dump(cdb_t *cdb, const cdb_file_pos_t *key, const cdb_file_pos_t *value, void *param) { /* has the cdb_callback signature: writes one record as "+klen,vlen:key->value" plus a newline to the FILE passed in 'param'; returns 0 on success, -1 on failure */
194 | 	assert(cdb);
195 | 	assert(key);
196 | 	assert(value);
197 | 	assert(param);
198 | 	FILE *out = param;
199 | 	char head[64+1], tail[64+2]; /* NOT INITIALIZED */
200 | 	head[0] = '+';
201 | 	tail[0] = ',';
202 | 	const unsigned hl = cdb_number_to_string(head + 1, key->length, 10) + 1;
203 | 	unsigned tl = cdb_number_to_string(tail + 1, value->length, 10) + 1;
204 | 	tail[tl++] = ':';
205 | 	tail[tl]   = '\0';
206 | 	if (fwrite(head, 1, hl, out) != hl || fwrite(tail, 1, tl, out) != tl) /* "+klen" followed by ",vlen:" */
207 | 		return -1;
208 | 	if (cdb_print(cdb, key, out) < 0 || fwrite("->", 1, 2, out) != 2 || cdb_print(cdb, value, out) < 0)
209 | 		return -1;
210 | 	return fputc('\n', out) == '\n' ? 0 : -1;
211 | }
218 | 
219 | static int cdb_dump_keys(cdb_t *cdb, const cdb_file_pos_t *key, const cdb_file_pos_t *value, void *param) { /* has the cdb_callback signature: writes "+klen:key" plus a newline to the FILE passed in 'param', the value is ignored; returns 0 on success, -1 on failure */
220 | 	assert(cdb);
221 | 	assert(key);
222 | 	assert(value);
223 | 	assert(param);
224 | 	UNUSED(value);
225 | 	FILE *out = param;
226 | 	char head[64+2]; /* NOT INITIALIZED */
227 | 	head[0] = '+';
228 | 	unsigned hl = cdb_number_to_string(head + 1, key->length, 10) + 1;
229 | 	head[hl++] = ':';
230 | 	head[hl]   = '\0';
231 | 	if (fwrite(head, 1, hl, out) != hl || cdb_print(cdb, key, out) < 0)
232 | 		return -1;
233 | 	return fputc('\n', out) == '\n' ? 0 : -1;
234 | }
237 | 
238 | static int cdb_string_to_number(const char *s, cdb_word_t *out) { /* Parse the unsigned decimal string 's' into '*out'. Returns 0 on success; returns -1 with '*out' set to zero if 's' is empty, holds a non-digit, is longer than 64 characters or does not fit in a cdb_word_t. */
239 | 	assert(s);
240 | 	assert(out);
241 | 	cdb_word_t result = 0;
242 | 	int ch = s[0];
243 | 	*out = 0;
244 | 	if (!ch)
245 | 		return -1;
246 | 	for (size_t j = 0; j < 64 && (ch = s[j]); j++) {
247 | 		const int digit = ch - '0';
248 | 		if (digit < 0 || digit > 9)
249 | 			return -1;
250 | 		if (result > (cdb_word_t)(((cdb_word_t)-1) - (cdb_word_t)digit) / (cdb_word_t)10ul) /* result * 10 + digit would wrap: reject it rather than return a wrong number */
251 | 			return -1;
252 | 		result = digit + (result * (cdb_word_t)10ul);
253 | 	}
254 | 	if (ch) /* stopped because of the length limit, not at the end of the string */
255 | 		return -1;
256 | 	*out = result;
257 | 	return 0;
258 | }
256 | 
257 | static int scan(FILE *input, cdb_word_t *out, int delim) { /* read a run of decimal digits from 'input' and convert it with cdb_string_to_number; with a non-zero 'delim' the character after the digits must be 'delim' (it is consumed), with a zero 'delim' that character is pushed back; returns 0 on success and -1 on failure */
258 | 	assert(input);
259 | 	char digits[64]; /* NOT INITIALIZED */
260 | 	size_t n = 0;
261 | 	int ch = 0;
262 | 	while (n < sizeof (digits) && (ch = fgetc(input)) != EOF && isdigit(ch))
263 | 		digits[n++] = ch;
264 | 	if (n == sizeof (digits)) /* buffer is full: no room left for the terminator */
265 | 		return -1;
266 | 	digits[n] = '\0';
267 | 	if (delim != 0 && ch != delim)
268 | 		return -1;
269 | 	if (delim == 0 && ungetc(ch, input) < 0)
270 | 		return -1;
271 | 	return cdb_string_to_number(digits, out);
272 | }
275 | 
276 | static int cdb_create(cdb_t *cdb, FILE *input) { /* Read records of the form "+klen,vlen:key->value" from 'input' and add each one to 'cdb' with cdb_add. White space between records is skipped and a record may be followed by "\n", "\r\n" or the end of input. Returns 0 once the input is exhausted and -1 on malformed input, allocation failure or a failed cdb_add. */
277 | 	assert(cdb);
278 | 	assert(input);
279 | 
280 | 	int r = 0;
281 | 	size_t kmlen = IO_BUFFER_SIZE, vmlen = IO_BUFFER_SIZE; /* current capacity of 'key' and 'value' */
282 | 	char *key = malloc(kmlen);
283 | 	char *value = malloc(vmlen);
284 | 	if (!key || !value)
285 | 		goto fail;
286 | 
287 | 	for (;;) {
288 | 		cdb_word_t klen = 0, vlen = 0;
289 | 		char sep[2] = { 0, };
290 | 		const int first = fgetc(input);
291 | 		if (first == EOF) /* || first == '\n' {need to handle '\r' as well} */
292 | 			goto end;
293 | 		if (isspace(first)) /* skip white space in front of a record */
294 | 			continue;
295 | 		if (first != '+')
296 | 			goto fail;
297 | 		if (scan(input, &klen, ',') < 0)
298 | 			goto fail;
299 | 		if (scan(input, &vlen, ':') < 0)
300 | 			goto fail;
301 | 		if (kmlen < klen) { /* grow the key buffer to fit; it is never shrunk */
302 | 			char *t = realloc(key, klen);
303 | 			if (!t)
304 | 				goto fail;
305 | 			kmlen = klen;
306 | 			key = t;
307 | 		}
308 | 
309 | 		if (vmlen < vlen) { /* grow the value buffer to fit; it is never shrunk */
310 | 			char *t = realloc(value, vlen);
311 | 			if (!t)
312 | 				goto fail;
313 | 			vmlen = vlen;
314 | 			value = t;
315 | 		}
316 | 
317 | 		if (fread(key, 1, klen, input) != klen)
318 | 			goto fail;
319 | 
320 | 		if (fread(sep, 1, sizeof sep, input) != sizeof sep)
321 | 			goto fail;
322 | 
323 | 		if (sep[0] != '-' || sep[1] != '>') /* key and value must be separated by "->" */
324 | 			goto fail;
325 | 
326 | 		if (fread(value, 1, vlen, input) != vlen)
327 | 			goto fail;
328 | 
329 | 		const cdb_buffer_t kb = { .length = klen, .buffer = key };
330 | 		const cdb_buffer_t vb = { .length = vlen, .buffer = value };
331 | 
332 | 		if (cdb_add(cdb, &kb, &vb) < 0) {
333 | 			(void)fprintf(stderr, "cdb file add failed\n");
334 | 			goto fail;
335 | 		}
336 | 		const int ch1 = fgetc(input); /* what follows a record: "\n", end of input, or "\r\n" */
337 | 		if (ch1 == '\n')
338 | 			continue;
339 | 		if (ch1 == EOF)
340 | 			goto end;
341 | 		if (ch1 != '\r')
342 | 			goto fail;
343 | 		if ('\n' != fgetc(input))
344 | 			goto fail;
345 | 	}
346 | fail:
347 | 	r = -1;
348 | end: /* both buffers are released on every path, success or failure */
349 | 	free(key);
350 | 	free(value);
351 | 	return r;
352 | }
353 | 
354 | static int cdb_stats(cdb_t *cdb, const cdb_file_pos_t *key, const cdb_file_pos_t *value, void *param) { /* callback passed to cdb_foreach by cdb_stats_print: folds the lengths of one record into the cdb_statistics_t passed in 'param'; always returns 0 */
355 | 	assert(cdb);
356 | 	assert(key);
357 | 	assert(value);
358 | 	assert(param);
359 | 	UNUSED(cdb);
360 | 	cdb_statistics_t *acc = param;
361 | 	acc->records++;
362 | 	/* key lengths */
363 | 	acc->total_key_length += key->length;
364 | 	acc->min_key_length    = MIN(acc->min_key_length, key->length);
365 | 	acc->max_key_length    = MAX(acc->max_key_length, key->length);
366 | 	/* value lengths */
367 | 	acc->total_value_length += value->length;
368 | 	acc->min_value_length    = MIN(acc->min_value_length, value->length);
369 | 	acc->max_value_length    = MAX(acc->max_value_length, value->length);
370 | 	return 0;
371 | }
370 | 
371 | static int cdb_stats_print(cdb_t *cdb, FILE *output, int verbose, size_t bytes) { /* Print statistics about the records and the hash tables of an open database to 'output'. 'bytes' is used as the size of one on-disk word when locating the entries of the initial table; a non-zero 'verbose' (which shadows the file scope variable of the same name) additionally dumps that table. Returns 0 on success and -1 on any read, seek or output failure. */
372 | 	assert(cdb);
373 | 	assert(output);
374 | 	unsigned long distances[DISTMAX] = { 0, }; /* histogram of how far entries sit from the slot their hash selects, last entry collects everything >= DISTMAX - 1 */
375 | 	unsigned long entries = 0, occupied = 0, collisions = 0, hmin = ULONG_MAX, hmax = 0;
376 | 	double avg_key_length = 0, avg_value_length = 0, avg_hash_length = 0;
377 | 	cdb_statistics_t s = {
378 | 		.records          = 0,
379 | 		.min_key_length   = ULONG_MAX, /* start at the maximum so the first record sets the minimum */
380 | 		.min_value_length = ULONG_MAX,
381 | 	};
382 | 
383 | 	if (cdb_foreach(cdb, cdb_stats, &s) < 0) /* pass one: key/value length statistics */
384 | 		return -1;
385 | 
386 | 	if (verbose)
387 | 		if (fputs("Initial hash table:\n", output) < 0)
388 | 			return -1;
389 | 
390 | 	for (size_t i = 0; i < 256; i++) { /* pass two: walk the 256 (position, count) pairs at the start of the file; NOTE(review): 256 is hard coded here, presumably it must match the library's bucket count - confirm */
391 | 		if (cdb_seek(cdb, i * (2ull * bytes)) < 0)
392 | 			return -1;
393 | 		cdb_word_t pos = 0, num = 0;
394 | 		if (cdb_read_word_pair(cdb, &pos, &num) < 0)
395 | 			return -1;
396 | 		if (verbose) {
397 | 			if ((i % 4) == 0) /* four pairs per output line */
398 | 				if (fprintf(output, "\n%3d:\t", (int)i) < 0)
399 | 					return -1;
400 | 			if (fprintf(output, "$%4lx %3ld, ", (long)pos, (long)num) < 0)
401 | 				return -1;
402 | 		}
403 | 
404 | 		collisions += num > 2ul;
405 | 		entries    += num;
406 | 		occupied   += num != 0;
407 | 		hmax        = MAX(num, hmax);
408 | 		if (num)
409 | 			hmin = MIN(num, hmin);
410 | 		if (cdb_seek(cdb, pos) < 0) /* go to the table that this pair points at */
411 | 			return -1;
412 | 		for (size_t j = 0; j < num; j++) {
413 | 			cdb_word_t h = 0, p = 0;
414 | 			if (cdb_read_word_pair(cdb, &h, &p) < 0)
415 | 				return -1;
416 | 			if (!p) /* a zero position is treated as an unused slot */
417 | 				continue;
418 | 			h = (h >> 8) % num; /* slot selected by the stored hash */
419 | 			if (h == j) {
420 | 				h = 0;
421 | 			} else {
422 | 				h = h < j ? j - h : num - h + j; /* distance from that slot to slot 'j', wrapping around the end of the table */
423 | 				h = MIN(h, DISTMAX - 1ul);
424 | 			}
425 | 			distances[h]++;
426 | 		}
427 | 	}
428 | 
429 | 	if (verbose)
430 | 		if (fputs("\n\n", output) < 0)
431 | 			return -1;
432 | 
433 | 	if (s.records == 0) { /* empty database: report zero instead of the ULONG_MAX start values */
434 | 		s.min_key_length = 0;
435 | 		s.min_value_length = 0;
436 | 		hmin = 0;
437 | 	} else {
438 | 		avg_key_length   = (double)s.total_key_length / (double) s.records;
439 | 		avg_value_length = (double)s.total_value_length / (double) s.records;
440 | 		avg_hash_length  = (double)entries / (double)occupied;
441 | 	}
442 | 
443 | 	if (fprintf(output, "records:\t\t\t%lu\n", s.records) < 0)
444 | 		return -1;
445 | 	if (fprintf(output, "key   min/max/avg/bytes:\t%lu/%lu/%g/%lu\n",
446 | 		s.min_key_length, s.max_key_length, avg_key_length, s.total_key_length) < 0)
447 | 		return -1;
448 | 	if (fprintf(output, "value min/max/avg/bytes:\t%lu/%lu/%g/%lu\n",
449 | 		s.min_value_length, s.max_value_length, avg_value_length, s.total_value_length) < 0)
450 | 		return -1;
451 | 	if (fprintf(output, "top hash table used/entries/collisions:\t%lu/%lu/%lu\n", occupied, entries, collisions) < 0)
452 | 		return -1;
453 | 	if (fprintf(output, "hash tables min/avg/max:\t%lu/%g/%lu\n", hmin, avg_hash_length, hmax) < 0)
454 | 		return -1;
455 | 	if (fprintf(output, "hash tables collisions/buckets:\t%lu/%lu\n", s.records - distances[0], entries) < 0) /* records that are not in the slot their hash selects */
456 | 		return -1;
457 | 	if (fputs("hash table distances:\n", output) < 0)
458 | 		return -1;
459 | 
460 | 	for (size_t i = 0; i < DISTMAX; i++) {
461 | 		const double pct = s.records ? ((double)distances[i] / (double)s.records) * 100.0 : 0.0;
462 | 		if (fprintf(output, "\td%u%s %4lu %5.2g%%\n", (unsigned)i, i == DISTMAX - 1ul ? "+:" : ": ", distances[i], pct) < 0)
463 | 			return -1;
464 | 	}
465 | 	return 0;
466 | }
467 | 
468 | static int cdb_query(cdb_t *cdb, char *key, int record, FILE *output) {
469 | 	assert(cdb);
470 | 	assert(key);
471 | 	assert(output);
472 | 	const cdb_buffer_t kb = { .length = strlen(key), .buffer = key };
473 | 	cdb_file_pos_t vp = { 0, 0, };
474 | 	const int gr = cdb_lookup(cdb, &kb, &vp, record);
475 | 	if (gr < 0)
476 | 		return -1;
477 | 	if (gr > 0) /* found */
478 | 		return cdb_print(cdb, &vp, output) < 0 ? -1 : 0;
479 | 	return 2; /* not found */
480 | }
481 | 
482 | /* We should output directly to a database as well... */
483 | static int generate(FILE *output, unsigned long records, unsigned long min, unsigned long max, unsigned long seed) { /* Write 'records' pseudo-random "+klen,vlen:key->value" lines (lower case letters only) followed by an empty line to 'output'. Key and value lengths are drawn from min .. max - 1, or are exactly 'min' when min equals max; a 'max' of zero selects 1024 and 'min' is clamped to 'max'. The output depends only on the arguments. Returns 0 on success and -1 on a write error. */
484 | 	assert(output);
485 | 	uint64_t s[2] = { seed, 0, };
486 | 	if (max == 0)
487 | 		max = 1024;
488 | 	if (min > max)
489 | 		min = max;
490 | 	const unsigned long span = max > min ? max - min : 1ul; /* never zero and cannot overflow; equals 'max' when 'min' is zero, so output for that case is unchanged. Replaces an inverted overflow test that rejected every non-zero 'min' */
491 | 	for (uint64_t i = 0; i < records; i++) {
492 | 		const unsigned long kl = (cdb_prng(s) % span) + min; /* adds bias but so what fight me */
493 | 		const unsigned long vl = (cdb_prng(s) % span) + min;
494 | 		if (fprintf(output, "+%lu,%lu:", kl, vl) < 0)
495 | 			return -1;
496 | 		for (unsigned long j = 0; j < kl; j++)
497 | 			if (fputc('a' + (cdb_prng(s) % 26), output) < 0)
498 | 				return -1;
499 | 		if (fputs("->", output) < 0)
500 | 			return -1;
501 | 		for (unsigned long j = 0; j < vl; j++)
502 | 			if (fputc('a' + (cdb_prng(s) % 26), output) < 0)
503 | 				return -1;
504 | 		if (fputc('\n', output) < 0)
505 | 			return -1;
506 | 	}
507 | 	if (fputc('\n', output) < 0) /* the list of records ends with an empty line */
508 | 		return -1;
509 | 	return 0;
510 | }
512 | 
513 | static int hasher(FILE *input, FILE *output) { /* should really input keys in "+length:key\n" format */
514 | 	assert(input);
515 | 	assert(output);
516 | 	char line[512] = { 0, }; /* long enough for everyone right? */
517 | 	for (; fgets(line, sizeof line, input); line[0] = 0) {
518 | 		size_t l = strlen(line);
519 | 		if (l && line[l-1] == '\n')
520 | 			line[l--] = 0;
521 | 		if (fprintf(output, "0x%08lx\n", (unsigned long)cdb_hash((uint8_t*)line, l)) < 0)
522 | 			return -1;
523 | 	}
524 | 	return 0;
525 | }
526 | /* Print the usage text, including the version fields obtained from 'cdb_version', to 'output'; 'arg0' is the program name shown. Returns the result of 'fprintf' (negative on failure) */
527 | static int help(FILE *output, const char *arg0) {
528 | 	assert(output);
529 | 	assert(arg0);
530 | 	unsigned long version = 0;
531 | 	if (cdb_version(&version) < 0) /* not fatal, the help text is printed anyway */
532 | 		info("version not set - built incorrectly");
533 | 	const unsigned q = (version >> 24) & 0xff; /* top byte, printed as "Options" */
534 | 	const unsigned x = (version >> 16) & 0xff; /* first  field of "Version" */
535 | 	const unsigned y = (version >>  8) & 0xff; /* second field of "Version" */
536 | 	const unsigned z = (version >>  0) & 0xff; /* third  field of "Version" */
537 | 	static const char *usage = "\
538 | Usage   : %s -hv *OR* -[cdkstVT] file.cdb *OR* -q file.cdb key [record#] *OR* -g *OR* -H\n\
539 | Program : Constant Database Driver (clone of https://cr.yp.to/cdb.html)\n\
540 | Author  : " CDB_AUTHOR "\n\
541 | Email   : " CDB_EMAIL "\n\
542 | Repo    : " CDB_REPO "\n\
543 | License : " CDB_LICENSE "\n\
544 | Version : %u.%u.%u\n\
545 | Options : 0x%x\n\
546 | Size    : %d\n\
547 | Notes   : See manual pages or project website for more information.\n\n\
548 | Options :\n\n\
549 | \t-h          : print this help message and exit successfully\n\
550 | \t-v          : increase verbosity level\n\
551 | \t-c file.cdb : create a new database reading keys from stdin\n\
552 | \t-d file.cdb : dump entire database\n\
553 | \t-k file.cdb : dump all keys (there may be duplicates)\n\
554 | \t-s file.cdb : calculate database statistics\n\
555 | \t-t file.cdb : run internal tests generating a test file\n\
556 | \t-T temp.cdb : name of temporary file to use\n\
557 | \t-V file.cdb : validate database\n\
558 | \t-q file.cdb key #? : run query for key with optional record number\n\
559 | \t-b size     : database size (valid sizes = 16, 32 (default), 64)\n\
560 | \t-o number   : specify offset into file where database begins\n\
561 | \t-H          : hash keys and output their hash\n\
562 | \t-g          : spit out an example database *dump* to standard out\n\
563 | \t-m number   : set minimum length of generated record\n\
564 | \t-M number   : set maximum length of generated record\n\
565 | \t-R number   : set number of generated records\n\
566 | \t-S number   : set seed for record generation\n\n\
567 | In create mode the key input format is:\n\n\
568 | \t+key-length,value-length:key->value\n\n\
569 | An example:\n\n\
570 | \t+5,5:hello->world\n\n\
571 | Queries are in a similar format:\n\n\
572 | \t+key-length:key\n\n\
573 | Binary key/values are allowed, as are duplicate and empty keys/values.\n\
574 | Returns values of 0 indicate success/found, 2 not found, and anything else\n\
575 | indicates an error.\n\
576 | ";
577 | 	return fprintf(output, usage, arg0, x, y, z, q,(int)(sizeof (cdb_word_t) * CHAR_BIT)); /* "Size" is the width of 'cdb_word_t' in bits */
578 | }
579 | 
580 | int main(int argc, char **argv) {
581 | 	enum { QUERY, DUMP, CREATE, STATS, KEYS, VALIDATE, GENERATE, };
582 | 	const char *file = NULL;
583 | 	char *tmp = NULL;
584 | 	int mode = VALIDATE, creating = 0;
585 | 	unsigned long min = 0ul, max = 1024ul, records = 1024ul, seed = 0ul;
586 | 
587 | 	binary(stdin);
588 | 	binary(stdout);
589 | 	binary(stderr);
590 | 
591 | 	char ibuf[BUFSIZ], obuf[BUFSIZ]; /* NOT INITIALIZED */
592 | 	if (setvbuf(stdin, ibuf, _IOFBF, sizeof ibuf) < 0)
593 | 		return -1;
594 | 	if (setvbuf(stdout, obuf, _IOFBF, sizeof obuf) < 0)
595 | 		return -1;
596 | 
597 | 	cdb_options_t ops = cdb_host_options;
598 | 
599 | 	cdb_getopt_t opt = { .init = 0 };
600 | 	for (int ch = 0; (ch = cdb_getopt(&opt, argc, argv, "hHgvt:c:d:k:s:q:V:b:T:m:M:R:S:o:G:")) != -1; ) {
601 | 		switch (ch) {
602 | 		case 'h': return help(stdout, argv[0]), 0;
603 | 		case 'H': return hasher(stdin, stdout);
604 | 		case 't': return -cdb_tests(&ops, opt.arg);
605 | 		case 'v': verbose++;                       break;
606 | 		case 'c': file = opt.arg; mode = CREATE;   break;
607 | 		case 'd': file = opt.arg; mode = DUMP;     break;
608 | 		case 'k': file = opt.arg; mode = KEYS;     break;
609 | 		case 's': file = opt.arg; mode = STATS;    break;
610 | 		case 'q': file = opt.arg; mode = QUERY;    break;
611 | 		case 'V': file = opt.arg; mode = VALIDATE; break;
612 | 		case 'g': mode = GENERATE;                 break;
613 | 		case 'T': assert(opt.arg); tmp  = opt.arg; break;
614 | 		case 'b': assert(opt.arg); ops.size   = atol(opt.arg); break;
615 | 		case 'm': assert(opt.arg); min        = atol(opt.arg); break;
616 | 		case 'M': assert(opt.arg); max        = atol(opt.arg); break;
617 | 		case 'R': assert(opt.arg); records    = atol(opt.arg); break;
618 | 		case 'S': assert(opt.arg); seed       = atol(opt.arg); break;
619 | 		case 'o': assert(opt.arg); ops.offset = atol(opt.arg); break;
620 | 		default: help(stderr, argv[0]); return 1;
621 | 		}
622 | 	}
623 | 
624 | 	/* N.B. We could also generate a CDB file directly as well,
625 | 	 * instead of generating a dump, the "generate" function
626 | 	 * would need a rewrite though */
627 | 	if (mode == GENERATE) {
628 | 		int r = generate(stdout, records, min, max, seed);
629 | 		/* Valgrind reports errors (on my setup) when writing to
630 | 		 * stdout and not flushing, the flush is called in the exit
631 | 		 * code and causes an error even though nothing *seems*
632 | 		 * incorrect. */
633 | 		if (fflush(stdout) < 0)
634 | 			r = -1;
635 | 		return r < 0 ? 1 : 0;
636 | 	}
637 | 
638 | 	/* For many of the modes "file" could be "stdout", this works
639 | 	 * for everything bar CREATE mode which will need to seek on
640 | 	 * its output. */
641 | 	if (!file)
642 | 		return help(stderr, argv[0]), 1;
643 | 
644 | 	creating = mode == CREATE;
645 | 
646 | 	cdb_t *cdb = NULL;
647 | 	const char *name = creating && tmp ? tmp : file;
648 | 	info("opening '%s' for %s", name, creating ? "writing" : "reading");
649 | 	const int etmp = errno;
650 | 	errno = 0;
651 | 	if (cdb_open(&cdb, &ops, creating, name) < 0) {
652 | 		const char *f = errno ? strerror(errno) : "unknown";
653 | 		const char *m = creating ? "create" : "read";
654 | 		die("opening file '%s' in %s mode failed: %s", name, m, f);
655 | 	}
656 | 	errno = etmp;
657 | 
658 | 	int r = 0;
659 | 	switch (mode) {
660 | 	case CREATE:   r = cdb_create(cdb, stdin);                                                       break;
661 | 	case DUMP:     r = cdb_foreach(cdb, cdb_dump,      stdout); if (fputc('\n', stdout) < 0) r = -1; break;
662 | 	case KEYS:     r = cdb_foreach(cdb, cdb_dump_keys, stdout); if (fputc('\n', stdout) < 0) r = -1; break;
663 | 	case STATS:    r = cdb_stats_print(cdb, stdout, 0, ops.size / 8ul);                              break;
664 | 	case VALIDATE: r = cdb_foreach(cdb, NULL, NULL);                                                 break;
665 | 	case QUERY: {
666 | 		if (opt.index >= argc)
667 | 			die("-q opt requires key (and optional record number)");
668 | 		char *key = argv[opt.index++];
669 | 		r = cdb_query(cdb, key, opt.index < argc ? atoi(argv[opt.index++]) : 0, stdout);
670 | 		break;
671 | 	}
672 | 	default:
673 | 		die("unimplemented mode: %d", mode);
674 | 	}
675 | 	if (fflush(stdout) < 0)
676 | 		r = -1;
677 | 
678 | 	const int cdbe = cdb_status(cdb);
679 | 	if (cdb_close(cdb) < 0)
680 | 		die("close failed: %d", cdbe);
681 | 	if (cdbe < 0)
682 | 		die("cdb internal error: %d", cdbe);
683 | 
684 | 	if (creating && tmp) {
685 | 		info("renaming temporary file");
686 | 		if (rename(tmp, file) < 0)
687 | 			die("rename from '%s' to '%s' failed: %s", tmp, file, strerror(errno));
688 | 	}
689 | 	return r < 0 ? 1 : 0;
690 | }
691 | 
692 | 


--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
 1 | # CDB makefile - default target should build everything
 2 | # Targets: all (the 'cdb' driver), test, install, dist, clean; extra compiler flags go in DEFINES/EXTRA.
 3 | VERSION =0x080000ul
 4 | CFLAGS  =-Wall -Wextra -fPIC -std=c99 -O3 -pedantic -fwrapv -Wmissing-prototypes -DCDB_VERSION="${VERSION}" ${DEFINES} ${EXTRA} 
 5 | TARGET  =cdb
 6 | AR      =ar
 7 | ARFLAGS =rcs
 8 | RANLIB  =ranlib
 9 | DESTDIR =install
10 | # The shared library suffix (DLL) depends on the host platform.
11 | ifeq ($(OS),Windows_NT)
12 | DLL=dll
13 | else # Assume Unixen
14 | DLL=so
15 | CFLAGS+=-D_FILE_OFFSET_BITS=64 
16 | endif
17 | 
18 | .PHONY: all test clean dist install
19 | 
20 | all: ${TARGET} # default goal: the command line driver
21 | 
22 | cdb.o: cdb.c cdb.h makefile # no recipe: compiled by make's built-in rule, rebuilt when the makefile changes
23 | 
24 | host.o: host.c host.h cdb.h makefile
25 | 
26 | main.o: main.c host.o cdb.h makefile # NOTE(review): 'host.o' looks like it was meant to be 'host.h' - confirm
27 | 
28 | lib${TARGET}.a: ${TARGET}.o ${TARGET}.h # static library; only the first prerequisite (the object file) is archived
29 | 	${AR} ${ARFLAGS} $@ $<
30 | 	${RANLIB} $@
31 | 
32 | lib${TARGET}.${DLL}: ${TARGET}.o ${TARGET}.h # shared library built from the same object file
33 | 	${CC} ${CFLAGS} -shared ${TARGET}.o -o $@
34 | 
35 | ${TARGET}: main.o host.o lib${TARGET}.a # link the driver, then strip it (a failing strip is ignored)
36 | 	${CC} $^ -o $@
37 | 	-strip ${TARGET}
38 | 
39 | test.cdb: ${TARGET} # run the driver's internal tests against test.cdb
40 | 	./${TARGET} -t test.cdb
41 | 
42 | test: test.cdb
43 | 
44 | ${TARGET}.1: readme.md # manual page generated from the readme; a pandoc failure is ignored
45 | 	-pandoc -s -f markdown -t man $< -o $@
46 | 
47 | .git: # obtain the repository metadata if no .git directory exists yet
48 | 	git clone https://github.com/howerj/cdb cdb-repo
49 | 	mv cdb-repo/.git .
50 | 	rm -rf cdb-repo
51 | 
52 | install: ${TARGET} lib${TARGET}.a lib${TARGET}.${DLL} ${TARGET}.1 .git # stage binaries, libraries, header, manual and a source checkout under DESTDIR
53 | 	install -p -D ${TARGET} ${DESTDIR}/bin/${TARGET}
54 | 	install -p -m 644 -D lib${TARGET}.a ${DESTDIR}/lib/lib${TARGET}.a
55 | 	install -p -D lib${TARGET}.${DLL} ${DESTDIR}/lib/lib${TARGET}.${DLL}
56 | 	install -p -m 644 -D ${TARGET}.h ${DESTDIR}/include/${TARGET}.h
57 | 	-install -p -m 644 -D ${TARGET}.1 ${DESTDIR}/man/${TARGET}.1
58 | 	mkdir -p ${DESTDIR}/src
59 | 	cp -a .git ${DESTDIR}/src
60 | 	cd ${DESTDIR}/src && git reset --hard HEAD
61 | 
62 | dist: install # pack the staged install tree into a versioned tarball
63 | 	tar zcf ${TARGET}-${VERSION}.tgz ${DESTDIR}
64 | 
65 | clean: .git # removes ALL untracked and ignored files through git
66 | 	git clean -dffx
67 | 
68 | 
69 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
   1 | % cdb(1) | Constant Database
   2 | 
   3 | # NAME
   4 | 
   5 | CDB - An interface to the Constant Database Library
   6 | 
   7 | # SYNOPSES
   8 | 
   9 | cdb -h
  10 | 
  11 | cdb -\[cdkstVG\] file.cdb
  12 | 
  13 | cdb -q file.cdb key \[record#\]
  14 | 
  15 | cdb -g -m minimum -M maximum -R records -S seed
  16 | 
  17 | cdb -H
  18 | 
  19 | # DESCRIPTION
  20 | 
  21 | 	Author:     Richard James Howe
  22 | 	License:    Unlicense
  23 | 	Repository: <https://github.com/howerj/cdb>
  24 | 	Email:      howe.r.j.89@gmail.com
  25 | 
  26 | A clone of the [CDB][] database, a simple, read-only (once created) database.
  27 | The database library is designed so it can be embedded into a microcontroller
  28 | if needed. This program can be used for creating and querying CDB databases,
  29 | which consist of key-value pairs of binary data.
  30 | 
  31 | This program also includes several options that help in testing out the
  32 | database, one for hashing input keys and printing the hash for the default hash
  33 | function and another one for generating a database with (Pseudo-)random keys
  34 | and values of a given length.
  35 | 
  36 | **This library can create 16, 32 and 64 bit versions of the CDB file format
  37 | removing one of the major limitations of the 32-bit version.**
  38 | 
  39 | **The 64-bit version of the database uses a different hash than djb2**.
  40 | 
  41 | # OPTIONS
  42 | 
  43 | **-h** : print out this help message and exit successfully
  44 | 
  45 | **-b** : set the size of the CDB database to use (default is 32, can be 16 or 64)
  46 | 
  47 | **-v**: increase verbosity level
  48 | 
  49 | **-t** *file.cdb* : run internal tests, exit with zero on a pass
  50 | 
  51 | **-c**  *file.cdb* : run in create mode
  52 | 
  53 | **-d**  *file.cdb* : dump the database
  54 | 
  55 | **-k**  *file.cdb* : dump the keys in the database
  56 | 
  57 | **-s**  *file.cdb* : print statistics about the database
  58 | 
  59 | **-T** *temp.cdb* : name of temporary file to use
  60 | 
  61 | **-V**  *file.cdb* : validate database
  62 | 
  63 | **-q**  *file.cdb key record-number* : query the database for a key, with an optional record
  64 | 
  65 | **-o** number : specify offset into file where database begins
  66 | 
  67 | **-H** : hash keys and output their hash
  68 | 
  69 | **-g**  : spit out an example database to standard out
  70 | 
  71 | **-m** number   : set minimum length of generated record
  72 | 
  73 | **-M** number   : set maximum length of generated record
  74 | 
  75 | **-R** number   : set number of generated records
  76 | 
  77 | **-S** number   : set seed for record generation
  78 | 
  79 | # EXAMPLES
  80 | 
  81 | Creating a database, called 'example.cdb':
  82 | 
  83 | 	$ ./cdb -c example.cdb
  84 | 	+0,1:->X
  85 | 	+1,0:Y->
  86 | 	+1,1:a->b
  87 | 	+1,1:a->b
  88 | 	+1,2:a->ba
  89 | 	+5,5:hello->world
  90 | 
  91 | Note that zero length keys and values are valid, and that duplicate keys are
  92 | allowed, even keys with the same value. A key with the specified value is
  93 | created for each duplicate, just like a non-duplicate key.
  94 | 
  95 | Looking up values in the created database:
  96 | 
  97 | 	./cdb -q example.cdb ""
  98 | 	./cdb -q example.cdb Y
  99 | 	./cdb -q example.cdb a
 100 | 	./cdb -q example.cdb a 0
 101 | 	./cdb -q example.cdb a 1
 102 | 	./cdb -q example.cdb a 2
 103 | 	./cdb -q example.cdb hello
 104 | 
 105 | Dumping a database:
 106 | 
 107 | 	$ ./cdb -d example.cdb
 108 | 
 109 | A database dump can be read straight back in to create another database:
 110 | 
 111 | 	$ ./cdb -d example.cdb | ./cdb -c should_have_just_used_copy.cdb
 112 | 
 113 | Which is not useful in itself, but *assuming* your data (both keys and
 114 | values) is ASCII text with no new lines and NUL characters then you could
 115 | filter out, modify or add in values with the standard Unix command line
 116 | tools.
 117 | 
 118 | # RETURN VALUE
 119 | 
 120 | cdb returns zero on success/key found, and a non zero value on failure. Two is
 121 | returned if a key is not found, any other value indicates a more serious
 122 | failure.
 123 | 
 124 | # LIMITATIONS
 125 | 
 126 | Three different versions of the library can be built; a 16, a 32 and a 64 bit
 127 | version. The 32 bit version is the default version. For all versions there is a
 128 | limit on the maximum file size in the format used of 2^N, where N is the size.
 129 | Keys and Values have the same limit (although they can never reach that size as
 130 | some of the overhead is taken up as part of the file format). Any other
 131 | arbitrary limitation is a bug in the implementation.
 132 | 
 133 | The minimum size of a CDB file is 256 \* 2 \* (N/8) bytes.
 134 | 
 135 | It should be noted that if you build a N bit (where N is 16, 32 or 64) 
 136 | version of this library you are limited to creating databases that are the
 137 | size of N and less, e.g. If `cdb_word_t` is set to `uint32_t`, and therefore
 138 | the 32-bit version of this library is being built, then you can create 32-bit
 139 | and 16-bit versions of the CDB database format, but you cannot make 64-bit
 140 | versions. You can set `cdb_word_t` to `uint64_t` (which enables the library
 141 | to create all three mutually incompatible versions of the library) on a
 142 | 32-bit system, naturally.
 143 | 
 144 | # INPUT/DUMP FORMAT
 145 | 
 146 | The input and dump format follow the same pattern, some ASCII text specifying
 147 | the beginning of a record and then some binary data with some separators, and
 148 | a newline terminating the record, the format is:
 149 | 
 150 | 	+key-length,value-length:KEY->VALUE
 151 | 	+key-length,value-length:KEY->VALUE
 152 | 	...
 153 | 	+key-length,value-length:KEY->VALUE
 154 | 
 155 | Despite the presence of textual data, the input key and value can contain
 156 | binary data, including the ASCII NUL character.
 157 | 
 158 | An example, encoding the key value pair "abc" to "def" and "G" to "hello":
 159 | 
 160 | 	+3,3:abc->def
 161 | 	+1,5:G->hello
 162 | 
 163 | The following [awk][] script can be used to pre-process a series of key-value
 164 | pairs in the format "key value", with one record per line and optional comment
 165 | lines:
 166 | 
 167 | 	#!/bin/sh
 168 | 	LC_ALL='C' awk '
 169 | 	  /^[^#]/ {
 170 | 	    print "+" length($1) "," length($2) ":" $1 "->" $2
 171 | 	  }
 172 | 	  END {
 173 | 	    print ""
 174 | 	  }
 175 | 	' | cdb -c "$@"
 176 | 
 177 | Which was available in the [original cdb][] program as 'cdbmake-12'.
 178 | 
 179 | # FILE FORMAT
 180 | 
 181 | The file format is incredibly simple, it is designed so that only the header
 182 | and the hash table pointer need to be stored in memory during generation of the
 183 | table - the keys and values can be streamed on to the disk. The file consists
 184 | of a header of 256 2-word values, forming an initial hash table that points to
 185 | the hash tables at the end of the file, then the key-value records, and finally
 186 | up to 256 hash tables pointing to the key-value pairs.
 187 | 
 188 | A word consists of a 4-byte/32-bit value (although this may be changed via
 189 | compile time options, creating an incompatible format). All word values are
 190 | stored in little-endian format.
 191 | 
 192 | The initial hash table contains an array of 256 2-word values.
 193 | The words are; a position of a hash table in the file and the number of buckets
 194 | in that hash table, stored in that order. To lookup a key the key is first
 195 | hashed, the lowest eight bits of the hash are used to index into the initial table
 196 | and if there are values in this hash the search then proceeds to the second hash
 197 | table at the end of the file.
 198 | 
 199 | The hash tables at the end of the file contain an array of two word records,
 200 | containing the full hash and a file position of the key-value pair. To search
 201 | for a key in this table the hash of the key is taken and the lowest eight bits
 202 | are discarded by shifting right eight places, the hash is then taken modulo the
 203 | number of elements in the hash table, the resulting value is used as an initial
 204 | index into the hash table. Searching continues until the key is found, or an
 205 | empty record is found, or the number of records in the table have been searched
 206 | through with no match. A key is compared by looking at the hash table records,
 207 | if the hash of the key matches the stored hash in the hash table records then a
 208 | possible match is found, the file position is then used to look up the
 209 | key-value pair and the key is compared.
 210 | 
 211 | The number of buckets in the hash table is chosen as twice the number of
 212 | populated entries in the hash table.
 213 | 
 214 | A key-value pair is stored as two words containing the key length and the value
 215 | length in that order, then the key, and finally the value.
 216 | 
 217 | The hashing algorithm used is similar to [djb2][] (except for the 64-bit
 218 | version, which uses a 64-bit variant of SDBM hash), but with a minor modification that 
 219 | an exclusive-or replaces an addition. 
 220 | 
 221 | The algorithm calculates hashes of the size of a word, the initial hash value is the special 
 222 | number '5381'.  The hash is calculated as the current hash value multiplied by 33; the
 223 | new byte to be hashed and the result of the multiplication then undergo an exclusive-or
 224 | operation. This repeats until all bytes to be hashed are processed. All
 225 | arithmetic operations are unsigned and performed modulo 2 raised to the power
 226 | of 32.
 227 | 
 228 | The pseudo code for this is:
 229 | 
 230 | 	set HASH to 5381
 231 | 	for each OCTET in INPUT:
 232 | 		set HASH to: ((HASH * 33) % pow(2, 32)) xor OCTET
 233 | 	return HASH
 234 | 
 235 | Note that there is nothing in the file format that disallows duplicate keys in
 236 | the database, in fact the API allows duplicate keys to be retrieved. Both key
 237 | and data values can also be zero bytes long. There are also no special
 238 | alignment requirements on the data.
 239 | 
 240 | The best documentation on the file format is a small pure python script that
 241 | implements a set of functions for manipulating a CDB database, a description is
 242 | available here <http://www.unixuser.org/~euske/doc/cdbinternals/> and the
 243 | script itself is available at the bottom of that page
 244 | <http://www.unixuser.org/~euske/doc/cdbinternals/pycdb.py>.
 245 | 
 246 | A visualization of the overall file structure:
 247 | 
 248 | 	         Constant Database Sections
 249 | 	.-------------------------------------------.
 250 | 	|   256 Bucket Initial Hash Table (2KiB)    |
 251 | 	.-------------------------------------------.
 252 | 	|            Key Value Pairs                |
 253 | 	.-------------------------------------------.
 254 | 	|       0-256 Secondary Hash Tables         |
 255 | 	.-------------------------------------------.
 256 | 
 257 | The initial hash table at the start of the file:
 258 | 
 259 | 	    256 Bucket Initial Hash Table (2KiB)
 260 | 	.-------------------------------------------.
 261 | 	| { P, L } | { P, L } | { P, L } |   ...    |
 262 | 	.----------+----------+----------+----------.
 263 | 	|   ...    | { P, L } | { P, L } | { P, L } |
 264 | 	.-------------------------------------------.
 265 | 	P = Position of secondary hash table
 266 | 	L = Number of buckets in secondary hash table
 267 | 
 268 | The key-value pairs:
 269 | 
 270 | 	.-------------------------------------------.
 271 | 	| { KL, VL } | KEY ...      | VALUE ...     |
 272 | 	.-------------------------------------------.
 273 | 	KL    = Key Length
 274 | 	VL    = Value Length
 275 | 	KEY   = Variable length binary data key
 276 | 	VALUE = Variable length binary value
 277 | 
 278 | Of the variable number of hash tables (which each are of a variable length) at
 279 | the end of the file:
 280 | 
 281 | 	 0-256 Variable Length Secondary Hash Tables
 282 | 	.---------------------.
 283 | 	| { H, P } | { H, P } |
 284 | 	.----------+----------+---------------------.
 285 | 	| { H, P } |   ...    |   ...    | { H, P } |
 286 | 	.----------+----------+----------+----------.
 287 | 	| { H, P } |   ...    | { H, P } |
 288 | 	.--------------------------------.
 289 | 	H = Hash
 290 | 	P = Position of Key-Value Pair
 291 | 
 292 | And that is all for the file format description.
 293 | 
 294 | While the key-value pairs can be streamed to disk and the second level hash
 295 | table written after those keys, anything that creates a database will have
 296 | to seek to the beginning of the file to rewrite the header, this could have
 297 | been avoided by storing the 256 initial hash table results at the end of
 298 | the file allowing a database to be constructed in a Unix filter, but alas,
 299 | this is not possible. Also of note, by passing in a custom hash algorithm to
 300 | the C API you have much more control over where each of the key-value pairs
 301 | get stored, specifically, which bucket they will end up in by controlling
 302 | the lowest 8-bits (for example you could set the lowest 8-bits to the first
 303 | byte in the key in a custom hash).
 304 | 
 305 | Note that there is nothing stopping you storing the key-value pairs in
 306 | some kind of order, you could do this by adding the keys in lexicographic
 307 | order for a database sorted by key. Retrieving keys using the C function
 308 | "cdb\_foreach" would allow you to retrieve keys in order. The hash table itself
 309 | would remain unaware of this order. Dumping the key-value pairs would maintain
 310 | this order as well. There is no guarantee other tools will preserve this
 311 | order however (they may dump key-value pairs backwards, or by going through
 312 | the hash table).
 313 | 
 314 | # CDB C API OVERVIEW
 315 | 
 316 | There are a few goals that the API has:
 317 | 
 318 | * Simplicity, there should be few functions and data structures.
 319 | * The API is easy to use.
 320 | * There should be minimal dependencies on the C standard library. The
 321 |   library itself should be small and not be a huge, non-portable, "optimized",
 322 |   mess.
 323 | * The user should decide when, where and how allocations are performed. The
 324 |   working set that is allocated should be small.
 325 | * The database driver should catch corrupt files if possible.
 326 | 
 327 | Some of these goals are in conflict, being able to control allocations and
 328 | having minimal dependencies allow the library to be used in an embedded system,
 329 | however it means that in order to do very basic things the user has to
 330 | provide a series of callbacks. The callbacks are simple to implement on a
 331 | hosted system, examples are provided in [main.c][] and [host.c][] in the
 332 | project repository, but this means the library is not just ready to use.
 333 | 
 334 | There are two sets of operations that most users will want to perform; creating
 335 | a database and reading keys. After the callbacks have been provided, to create
 336 | a database requires opening up a new database in create mode:
 337 | 
 338 | 	/* error handling omitted for brevity */
 339 | 	cdb_t *cdb = NULL;
 340 | 	cdb_options_t ops = { /* Your file callbacks/options go here */ };
 341 | 	cdb_open(&cdb, &ops, 1, "example.cdb");
 342 | 	cdb_buffer_t key   = { .length = 5, .buffer = "hello", };
 343 | 	cdb_buffer_t value = { .length = 5, .buffer = "world", };
 344 | 	cdb_add(cdb, &key, &value);
 345 | 	cdb_close(cdb);
 346 | 
 347 | If you are dealing with mostly NUL terminated ASCII/UTF-8 strings it is worth
 348 | creating a function to deal with them:
 349 | 
 350 | 	int cdb_add_string(cdb_t *cdb, const char *key, const char *value) {
 351 | 		assert(cdb);
 352 | 		assert(key);
 353 | 		assert(value);
 354 | 		const cdb_buffer_t k = { .length = strlen(key),   .buffer = (char*)key,   };
 355 | 		const cdb_buffer_t v = { .length = strlen(value), .buffer = (char*)value, };
 356 | 		return cdb_add(cdb, &k, &v);
 357 | 	}
 358 | 
 359 | Note that you *cannot* query for a key from a database opened up in create
 360 | mode and you *cannot* add a key-value pair to a database opened up in read
 361 | mode. The operations are mutually exclusive.
 362 | 
 363 | To search for a key within the database, you open up a database connection in
 364 | read mode (create = 0):
 365 | 
 366 | 	/* error handling omitted for brevity */
 367 | 	cdb_t *cdb = NULL;
 368 | 	cdb_options_t ops = { /* Your file callbacks/options go here */ };
 369 | 	cdb_open(&cdb, &ops, 0, "example.cdb");
 370 | 	cdb_buffer_t key = { .length = 5, .buffer = "hello" };
 371 | 	cdb_file_pos_t value = { 0, 0, };
 372 | 	cdb_get(cdb, &key, &value);
 373 | 	/* use cdb_seek, then cdb_read, to use returned value */
 374 | 	cdb_close(cdb);
 375 | 
 376 | Upon retrieval of a key the database does not allocate a value for you, instead
 377 | it provides an object consisting of a file position and a length of the value.
 378 | This can be read from wherever the database is stored with the function
 379 | 'cdb\_read'. Before issuing a read, 'cdb\_seek' *must* be called as the file
 380 | handle may be pointing to a different area in the database.
 381 | 
 382 | If a read or a seek is issued that goes outside of the bounds of the database
 383 | then all subsequent database operations on that handle will fail, not just
 384 | reads or seeks. The only valid things to do on a database that has returned a
 385 | negative number is to call 'cdb\_status' and then 'cdb\_close' and never
 386 | use the handle again. 'cdb\_status' must not be used on a closed handle.
 387 | 
 388 | As there are potentially duplicate keys, the function 'cdb\_count' can be
 389 | used to query for duplicates. It sets the parameter count to the number of
 390 | records found for that key (and it sets count to zero, and returns zero, if no
 391 | keys are found, it returns one if one or more keys were found).
 392 | 
 393 | The function 'cdb\_status' can be used to query what error has occurred, if
 394 | any. On an error a negative value is returned, the meaning of this value is
 395 | deliberately not included in the header as the errors recorded and the
 396 | meaning of their values may change. Use the source for the library to determine
 397 | what error occurred.
 398 | 
 399 | The function 'cdb\_version' returns the version number in an out parameter
 400 | and information about the compile time options selected when the library was built.
 401 | A [Semantic Version Number][] is used, which takes the form "MAJOR.MINOR.PATCH".
 402 | The PATCH number is stored in the Least Significant Byte, the MINOR number the
 403 | next byte up, and the MAJOR in the third byte. The fourth byte contains the
 404 | compile time options.
 405 | 
 406 | There are several things that could be done to speed up the database but this
 407 | would complicate the implementation and the API.
 408 | 
 409 | ## C API FUNCTIONS
 410 | 
 411 | The C API contains 13 functions and some callbacks, more than is
 412 | desired, but they all have their uses. Ideally a library would
 413 | contain far fewer functions and require less of a cognitive burden
 414 | on the user to get right, however making a generic enough C library
 415 | and using C in general requires more complexity than is usual, but
 416 | not more than is necessary.
 417 | 
 418 | There is regularity in these functions, they all return negative
 419 | on failure (the only exception being the allocator callback that
 420 | returns a pointer), most of the functions accept a "cdb\_t" structure
 421 | as well, which is an [opaque pointer][] (opaque pointers are not
 422 | an unalloyed good, they imply that an allocator must be used, which
 423 | can be a problem in embedded systems).
 424 | 
 425 | 	int cdb_open(cdb_t **cdb, const cdb_options_t *ops, int create, const char *file);
 426 | 	int cdb_close(cdb_t *cdb);
 427 | 	int cdb_read(cdb_t *cdb, void *buf, cdb_word_t length);
 428 | 	int cdb_add(cdb_t *cdb, const cdb_buffer_t *key, const cdb_buffer_t *value);
 429 | 	int cdb_seek(cdb_t *cdb, cdb_word_t position);
 430 | 	int cdb_foreach(cdb_t *cdb, cdb_callback cb, void *param);
 431 | 	int cdb_read_word_pair(cdb_t *cdb, cdb_word_t *w1, cdb_word_t *w2);
 432 | 	int cdb_get(cdb_t *cdb, const cdb_buffer_t *key, cdb_file_pos_t *value);
 433 | 	int cdb_lookup(cdb_t *cdb, const cdb_buffer_t *key, cdb_file_pos_t *value, long record);
 434 | 	int cdb_count(cdb_t *cdb, const cdb_buffer_t *key, long *count);
 435 | 	int cdb_status(cdb_t *cdb);
 436 | 	int cdb_version(unsigned long *version);
 437 | 	int cdb_tests(const cdb_options_t *ops, const char *test_file);
 438 | 
 439 | 	typedef int (*cdb_callback)(cdb_t *cdb, const cdb_file_pos_t *key, const cdb_file_pos_t *value, void *param);
 440 | 
 441 | * cdb\_open
 442 | 
 443 | The most complex function that contains the most parameters, "cdb\_open"
 444 | is used to open a connection to a database. A pointer to a handle is
 445 | passed to the first parameter, using the supplied allocation callback
 446 | (passed-in in the "ops" parameter) the function will allocate enough space
 447 | for "cdb\_t" structure, this out-parameter is the database handle. It will
 448 | be set to NULL on failure, which will also be indicated with a negative
 449 | return value on the "cdb\_open" function. Once "cdb\_close" is called on
 450 | this handle the handle *should not* be used again, and "cdb\_close" should
 451 | only be called on the returned handle *once*.
 452 | 
 453 | A single database can be opened by as many readers as you like, however
 454 | reading a database and writing to a database are mutually exclusive operations.
 455 | 
 456 | When writing to a database there *should not* be any readers active on
 457 | that database. This is a fundamental limitation of the database design.
 458 | 
 459 | Writing to a CDB file that is being read by another CDB instance can
 460 | cause corruption of data and general nasty things! Do not do it!
 461 | 
 462 | As such, a database can only be opened up in read only, or write only
 463 | mode.
 464 | 
 465 | The "file" parameter is passed to the "open" callback, which is present
 466 | in the "ops" parameter.
 467 | 
 468 | 		void *(*open)(const char *name, int mode);
 469 | 
 470 | The callback should return an opaque pointer on success and NULL on failure.
 471 | It is used to open up a handle to the database via whatever method the
 472 | library user would like (for example, a simple file present in your file
 473 | system, or a section of flash in an embedded computer). The open callback
 474 | is used by "cdb\_open" and should not be called directly.
 475 | 
 476 | The "mode" parameter to the "open" callback will be set to "CDB\_RW\_MODE" if
 477 | "create" is non-zero, and will be set to "CDB\_RO\_MODE" if it is zero.
 478 | 
 479 | CDB\_RW\_MODE is an enumeration that has the value "1", whilst
 480 | CDB\_RO\_MODE has the value "0".
 481 | 
 482 | "cdb\_open" does quite a lot, when opening a CDB file for reading the
 483 | file is *partially* verified, when opening for writing a blank first level
 484 | hash table is written to disk. If either of these fails, then opening
 485 | the database will fail.
 486 | 
 487 | The function also needs the callbacks to perform a seek to be present,
 488 | along with the callback for reading. The write callback only needs to
 489 | be present when the database is opened up in write mode.
 490 | 
 491 | * cdb\_close
 492 | 
 493 | This closes the CDB database handle, the handle may be NULL, if so,
 494 | nothing will be done. The same handle should not be passed in twice
 495 | to "cdb\_close" as this can cause double-free errors. This function
 496 | will release any memory and handles (by calling the "close" callback)
 497 | associated with the handle.
 498 | 
 499 | When writing a database this function has one more task to do, and
 500 | that is finalizing the database, it writes out the hash-table at
 501 | the end of the file. If "cdb\_close" is not called after the
 502 | last entry has been added then the database will be in an invalid
 503 | state and will not work.
 504 | 
 505 | This function may return negative on error, for example if the
 506 | finalization fails.
 507 | 
 508 | After calling "cdb\_close" the handle *must not* be used again.
 509 | 
 510 | * cdb\_read
 511 | 
 512 | To be used on a database opened up in read-mode only. This can
 513 | be used to read values, and sometimes keys, from the database. This
 514 | function does not call "cdb\_seek", the caller must call "cdb\_seek"
 515 | before calling this function to move the file pointer to the
 516 | desired location before reading. The file pointer will be updated
 517 | to point to after the location that has been read (or more accurately,
 518 | the read callback must do this). This function *does not* return the
 519 | number of bytes read, instead it returns zero for no error and
 520 | negative if an error condition occurs (a partial read is treated as
 521 | an error).
 522 | 
 523 | * cdb\_add
 524 | 
 525 | To be used on a database opened up in write, or creation, mode only.
 526 | 
 527 | This function adds a key-value pair to the database, which can be
 528 | looked up only after finalizing the database (by calling "cdb\_close")
 529 | and reopening the database in read-only mode, which should be done
 530 | after the final "cdb\_add" has been added.
 531 | 
 532 | It is unfortunate that both the key and value must reside within
 533 | memory, but doing anything else would complicate the API too much.
 534 | 
 535 | Once the key and value have been added they can be freed or discarded
 536 | however.
 537 | 
 538 | Adding key-value pairs consumes disk space and some extra memory
 539 | which is needed to store the second level hash table, however the
 540 | keys and values are not kept around in memory by the CDB library.
 541 | 
 542 | Note that this function will add duplicate keys without complaining,
 543 | and can add zero length keys and values, likewise without complaining.
 544 | 
 545 | It is entirely up to the caller to prevent duplicates from being
 546 | added. This is one improvement that could be added to the library (as
 547 | you cannot check or query a partially written database at the
 548 | moment).
 549 | 
 550 | 
 551 | * cdb\_seek
 552 | 
 553 | This function changes the position that the next read or write
 554 | will occur from. You should not seek before or after the database,
 555 | doing so will result in an error. Seeking is always relative to the
 556 | start of the file, the optional offset specified in the CDB options
 557 | structure being added to the current position. Relative to current
 558 | position or file-end seeks cannot be done.
 559 | 
 560 | This function must be called before each call to "cdb\_read" or
 561 | "cdb\_read\_word\_pair", otherwise you may read garbage.
 562 | 
 563 | Calling "cdb\_seek" multiple times on the same location has no
 564 | effect (the "fseek" C standard library function may discard buffers
 565 | if called multiple times on the same location even though the file
 566 | position has not changed).
 567 | 
 568 | * cdb\_foreach
 569 | 
 570 | The "cdb\_foreach" function calls a callback for each value within
 571 | the CDB database. The callback is passed an optional "param". If
 572 | the callback returns negative or a non-zero number then the for-each
 573 | loop is terminated early (a positive number is returned, a negative
 574 | number results in -1 being returned). If the callback returns zero
 575 | then the next value, if any, is processed with the callback being
 576 | called again.
 577 | 
 578 | The callback is passed a structure which contains the location
 579 | within the CDB database that contains the key and value. The keys
 580 | and values are not presented in any specific order and the order
 581 | should not be expected to stay the same between calls.
 582 | 
 583 | To read either a key or a value you must call "cdb\_seek" before
 584 | calling "cdb\_read" yourself.
 585 | 
 586 | Passing in NULL is allowed and is not a No-Operation, it can be
 587 | used to effectively check the integrity of the database.
 588 | 
 589 | * cdb\_read\_word\_pair
 590 | 
 591 | To be used on a database opened up in read-mode only. This function
 592 | is a helper function that strictly does not need to exist, it is
 593 | used for reading two "cdb\_word\_t" values from the database. This
 594 | can be useful for the library user for more detailed analysis of
 595 | the database than would normally be possible, many values within
 596 | the database are stored as two "cdb\_word\_t" values. Looking inside this
 597 | read-only database is not discouraged and the file format is well
 598 | documented.
 599 | 
 600 | This function does not call "cdb\_seek", that must be called
 601 | before hand to seek to the desired file location. The file position
 602 | will be updated to point after the two read values.
 603 | 
 604 | * cdb\_get
 605 | 
 606 | This function populates the "value" structure if the "key" is found
 607 | within the CDB database. The members of "value" will be set to zero
 608 | if a key is not found, if it is found the position will be non-zero,
 609 | although the length may be zero.
 610 | 
 611 | Note that this function does not actually retrieve the value and put it
 612 | into a buffer, there is a very good reason for that. It would be easy
 613 | enough to make such a function given the functions present in this
 614 | API, however in order to make such a function it would have to do
 615 | the following; allocate enough space to store the value, read the
 616 | value off of disk and then return the result. This has massive performance
 617 | implications. Imagine if a large value is stored in the database, say
 618 | a 1GiB value, this would mean at least 1GiB of memory would need to
 619 | be allocated, it would also mean all of the file buffers would have
 620 | been flushed and refilled, and all of that data would need to be copied
 621 | from disk to memory. This might be desired, it might also be *very*
 622 | wasteful, especially if only a fraction of the value is actually
 623 | needed (say the first few hundred bytes). Whether this is wasteful
 624 | depends entirely on your workload and use-cases for the database.
 625 | 
 626 | It is better to give the user tools to do what they need than insisting
 627 | it be done one, limiting, although "easy", way.
 628 | 
 629 | This does mean that to actually retrieve the value the user must
 630 | perform their own "cdb\_seek" and "cdb\_read" operations. This
 631 | means that the entire value does not need to be read into memory
 632 | by the consumer, and can potentially be processed block by block by
 633 | the "read" callback if needed.
 634 | 
 635 | * cdb\_lookup
 636 | 
 637 | "cdb\_lookup" is similar to "cdb\_get" except it accepts an
 638 | optional record number. Everything that applies to the get-function
 639 | applies to the lookup-function, the only difference is the record
 640 | number argument (internally "cdb\_get" is implemented with
 641 | "cdb\_lookup").
 642 | 
 643 | If there are two or more keys that are identical then the question
 644 | of how to select a specific key arises. This is done with an
 645 | arbitrary number that will most likely, but is not guaranteed, to
 646 | be the order in which the key was added into the database, with the
 647 | first value being zero and the index being incremented from there
 648 | on out.
 649 | 
 650 | If the key is found but the index is out of bounds it is treated
 651 | as if the key does not exist. Use "cdb\_count" to calculate the
 652 | maximum number of records per key if needed, it is far more expensive
 653 | to repeatedly call "cdb\_lookup" on a key until it returns "key
 654 | not found" to determine the number of duplicate keys than it is
 655 | to call "cdb\_count".
 656 | 
 657 | The index argument perhaps should be a "cdb\_word\_t", but there
 658 | is always debate around these topics (personally if I were to
 659 | design a C-like programming language all integers would default
 660 | to 64-bits and all pointers would fit within that, other types
 661 | for indexing and the like would also be 64-bit, that's not a
 662 | criticism of C, the madness around integer types was born out
 663 | of necessity).
 664 | 
 665 | * cdb\_count
 666 | 
 667 | The "cdb\_count" function counts the number of entries that
 668 | have the same key value. This function requires potentially multiple
 669 | seeks and reads to compute, so the returned value should be cached if
 670 | you plan on using it again as the value is expensive to calculate.
 671 | 
 672 | If the key is not found, a value indicating that will be returned
 673 | and the count argument will be zeroed. If found, the count will
 674 | be put in the count argument.
 675 | 
 676 | * cdb\_status
 677 | 
 678 | This function returns the status of the CDB library handle. All
 679 | errors are sticky in this library, if an error occurs when handling
 680 | a CDB database then there is no way to clear that error short of
 681 | reopening the database with a new handle. The only valid operation
 682 | to do after getting an error from any of the functions that operate
 683 | on a "cdb\_t" handle is to call "cdb\_status" to query the error
 684 | value that is stored internally.
 685 | 
 686 | "cdb\_status" should return a zero on no error and a negative value
 687 | on failure. It should not return a positive non-zero value.
 688 | 
 689 | * cdb\_version
 690 | 
 691 | "cdb\_version" returns the version number of the library. It stores
 692 | the value in an unsigned long. This may return an error value and a
 693 | zero value if the version has not been set correctly at compile time.
 694 | 
 695 | The value is stored in "MAJOR.MINOR.PATCH" format, with "PATCH" stored
 696 | in the Least Significant Byte. This is a semantic version number. If
 697 | the "MAJOR" number has changed then there are potentially breaking
 698 | changes in the API or ABI of this library that have been introduced,
 699 | no matter how trivial.
 700 | 
 701 | * cdb\_tests
 702 | 
 703 | And the callback for "cdb\_foreach":
 704 | 
 705 | * "cdb\_callback"
 706 | 
 707 | This callback is called for each value within the CDB database
 708 | when used with "cdb\_foreach". If a negative value is returned from
 709 | this callback then the foreach loop will end early and an error value
 710 | will be returned. If the value returned is greater than zero then
 711 | the foreach loop will terminate potentially early. If zero the
 712 | foreach loop will continue to the next key-value pair if available.
 713 | 
 714 | Each time this callback is called by "cdb\_foreach" it will be
 715 | passed in a key-value pair in the form of two length/file-location
 716 | structures. You will need to seek to those locations and
 717 | read the key-values yourself. There is no guarantee the file position
 718 | is in the correct location (ie. Pointing to the location of the
 719 | key), so call "cdb\_seek" before calling "cdb\_read".
 720 | 
 721 | There is no guarantee that the key-value pairs will be presented
 722 | in the same order each time the function is called and should not
 723 | be counted on. There is no attempt to preserve order.
 724 | 
 725 | See "cdb\_foreach" for more information.
 726 | 
 727 | ## C API STRUCTURES
 728 | 
 729 | The C API has two simple structures and one complex one, the latter being
 730 | more of a container for callbacks (or, some might say, a way of doing
 731 | object oriented programming in C). The complex structure, "cdb\_options\_t",
 732 | is an unfortunate necessity.
 733 | 
 734 | The other two structures, "cdb\_buffer\_t" and "cdb\_file\_pos\_t", are
 735 | simple enough and need very little explanation, although they will be explained.
 736 | 
 737 | Let us look at the "cdb\_options\_t" structure:
 738 | 
 739 | 	typedef struct {
 740 | 		void *(*allocator)(void *arena, void *ptr, size_t oldsz, size_t newsz);
 741 | 		cdb_word_t (*hash)(const uint8_t *data, size_t length);
 742 | 		int (*compare)(const void *a, const void *b, size_t length);
 743 | 		cdb_word_t (*read)(void *file, void *buf, size_t length);
 744 | 		cdb_word_t (*write)(void *file, void *buf, size_t length);
 745 | 		int (*seek)(void *file, uint64_t offset);
 746 | 		void *(*open)(const char *name, int mode);
 747 | 		int (*close)(void *file);
 748 | 		int (*flush)(void *file);
 749 | 
 750 | 		void *arena;
 751 | 		cdb_word_t offset;
 752 | 		unsigned size;
 753 | 	} cdb_options_t;
 754 | 
 755 | Each member of the structure will need an explanation.
 756 | 
 757 | ## STRUCTURE CALLBACKS
 758 | 
 759 | * allocator
 760 | 
 761 | This function is based off of the allocator callback mechanism
 762 | present in Lua, see <https://www.lua.org/manual/5.1/manual.html#lua_setallocf>
 763 | for more information on that allocator. This function can handle
 764 | freeing memory, allocating memory, and reallocating memory, all
 765 | in one function. This allows the user of this library to specify
 766 | where objects are allocated and how.
 767 | 
 768 | The arguments to the callback mean:
 769 | 
 770 | 1. arena
 771 | 
 772 | This may be NULL, it is an optional argument that can be used
 773 | to store memory allocation statistics or as part of an arena
 774 | allocator.
 775 | 
 776 | 2. ptr
 777 | 
 778 | This should be NULL if allocating new memory, or be a pointer
 779 | to some previously allocated memory if freeing memory or
 780 | reallocating it.
 781 | 
 782 | 3. oldsz
 783 | 
 784 | The old size of the pointer if known, if unknown, use zero. This is
 785 | used to prevent unnecessary allocations.
 786 | 
 787 | 4. newsz
 788 | 
 789 | The new size of the desired pointer, this should be non-zero
 790 | if reallocating or allocating memory. To free memory set this
 791 | to zero, along with providing a pointer to free. If this is zero
 792 | and the "ptr" is NULL then nothing will happen.
 793 | 
 794 | 5. The return value
 795 | 
 796 | This will be NULL on failure if allocating memory or reallocating
 797 | memory and that operation failed. It will be non-NULL on success,
 798 | containing usable memory. If freeing memory this should return NULL.
 799 | 
 800 | An example allocator using the built in allocation routines is:
 801 | 
 802 | 	void *allocator_cb(void *arena, void *ptr, size_t oldsz, size_t newsz) {
 803 | 		UNUSED(arena);
 804 | 		if (newsz == 0) {
 805 | 			free(ptr);
 806 | 			return NULL;
 807 | 		}
 808 | 		if (newsz > oldsz)
 809 | 			return realloc(ptr, newsz);
 810 | 		return ptr;
 811 | 	}
 812 | 
 813 | This callback is both simple and flexible, and more importantly
 814 | puts the control of allocating back to the user (I know I have
 815 | repeated this *many* times throughout this document, but it is
 816 | worth repeating!).
 817 | 
 818 | 	compare: /* key comparison function: NULL defaults to memcmp */
 819 | 	write: /* (conditionally optional) needed for database creation only */
 820 | 	flush: /* (optional) called at end of successful creation */
 821 | 
 822 | 	arena:   /* used for 'arena' argument for the allocator, can be NULL if allocator allows it */
 823 | 	offset: /* starting offset for CDB file if not at beginning of file */
 824 | 	size:  /* Either 0 (same as 32), 16, 32 or 64, but cannot be bigger than 'sizeof(cdb_word_t)*8' */
 825 | 
 826 | * hash (optional)
 827 | 
 828 | The "hash" callback can be set to NULL, if that is the case then
 829 | the default hash, based off of djb2 and present in the original
 830 | CDB library, will be used. If you do provide your own hash function
 831 | you will effectively make this database incompatible with the standard
 832 | CDB format but there are valid reasons for you to do this, you might
 833 | need a stronger hash that is more resistant to denial of service attacks,
 834 | or perhaps you want similar keys to *collide* more to group them together.
 835 | 
 836 | The hash function returns "cdb\_word\_t" so the number of bits this
 837 | function returns is dependent on how big that type is (determined at
 838 | compile time).
 839 | 
 840 | * compare (optional)
 841 | 
 842 | This function compares keys for a match, the function should behave like
 843 | [memcmp][], returning the same values on a match and a failure. You
 844 | may want to change this function if you want to compare keys partially,
 845 | however you will also need to change the hash function to ensure keys are
 846 | sorted into the right 256 buckets for your comparison (for example, with
 847 | the default hash function two keys with the same prefix could be stored in
 848 | two separate buckets).
 849 | 
 850 | ### FILE CALLBACKS
 851 | 
 852 | The following callbacks act in a similar way to the file functions present
 853 | in [stdio.h][]. The only function missing is an [ftell][] equivalent.
 854 | 
 855 | * read
 856 | 
 857 | This function is used to read data out of the database, wherever that
 858 | data is stored. Unlike [fread][] a status code is returned instead of
 859 | the length of the data read, negative indicating failure. A partial read
 860 | should result in a failure. The only thing lacking from this callback
 861 | is a way to signal to perform non-blocking Input and Output, that would
 862 | complicate the internals however. The "read" callback should always be
 863 | present.
 864 | 
 865 | The first parameter, "file", is a handle to an object returned by the
 866 | "open" callback.
 867 | 
 868 | The callback should return 0 indicating no error if "length" bytes have
 869 | been read into "buf".
 870 | 
 871 | Reading should continue from the previous file pointer position, that
 872 | is if you open a file handle, read X bytes, the next time you read Y
 873 | bytes they should be read from the end of the X bytes and not the
 874 | beginning of the file (hence why read does not take a file position).
 875 | 
 876 | If implementing read callbacks in an embedded system you might have to
 877 | also implement that behavior.
 878 | 
 879 | * write (conditionally optional, needed for database creation only)
 880 | 
 881 | Similar to the "read" callback, but instead writes data into wherever
 882 | the database is stored.
 883 | 
 884 | * seek
 885 | 
 886 | This callback sets the file position that subsequent reads and writes
 887 | occur from.
 888 | 
 889 | * open
 890 | 
 891 | This callback should open the resource specified by the "name" string
 892 | (which will usually be a file name). There are two modes a read/write
 893 | mode (used to create the database) and a read-only mode. This callback
 894 | much like the "close" callback will only be called once internally
 895 | by the CDB library.
 896 | 
 897 | * close
 898 | 
 899 | This callback should close the file handle returned by "open", freeing
 900 | any resources associated with that handle.
 901 | 
 902 | * flush (optional)
 903 | 
 904 | An optional callback used for flushing writes to mass-storage. If NULL
 905 | then the function will not be called.
 906 | 
 907 | ## STRUCTURE VARIABLES
 908 | 
 909 | * arena (optional, can be NULL, depends on your allocator)
 910 | 
 911 | This value is passed into the allocator as the "arena" argument whenever
 912 | the allocator is called. It can be NULL, which will usually be the case
 913 | if you are just using "malloc", "realloc" and "free" to implement the
 914 | allocator, but if you are implementing your own arena based allocator you
 915 | might want to set it to point to your arena (hence the name).
 916 | 
 917 | * offset
 918 | 
 919 | This offset can be used for CDB databases embedded within a file. If
 920 | the CDB database does not begin at the start of the file (or flash, or
 921 | wherever) then you can set this offset to skip over that many number
 922 | of bytes in the file.
 923 | 
 924 | * size
 925 | 
 926 | The size variable, which can be left at zero, is used to select
 927 | the word size of the database, this has an interaction with "cdb\_word\_t".
 928 | 
 929 | Missing perhaps is an unsigned field that could contain options
 930 | in each bit position in that field.
 931 | 
 932 | 
 933 | ## BUFFER STRUCTURE
 934 | 
 935 | 	typedef struct {
 936 | 		cdb_word_t length; /* length of data */
 937 | 		char *buffer;      /* pointer to arbitrary data */
 938 | 	} cdb_buffer_t; /* used to represent a key or value in memory */
 939 | 
 940 | ## FILE POSITION STRUCTURE
 941 | 
 942 | 	typedef struct {
 943 | 		cdb_word_t position; /* position in file, for use with cdb_read/cdb_seek */
 944 | 		cdb_word_t length;   /* length of data on disk, for use with cdb_read */
 945 | 	} cdb_file_pos_t; /* used to represent a value on disk that can be accessed via 'cdb_options_t' */
 946 | 
 947 | ## EMBEDDED SUITABILITY
 948 | 
 949 | There are many libraries written in C, for better or worse, as it is the
 950 | lingua franca for software development at the moment. Few of those libraries
 951 | are directly suitable for use in [Embedded systems][] and are much less
 952 | flexible than they could be in general. Embedded systems pose some interesting
 953 | constraints (eschewing allocation via "malloc", lack of a file-system, and
 954 | more). By designing the library for an embedded system we can make a library
 955 | more useful not only for those systems but for hosted systems as well (eg. By
 956 | providing callbacks for the FILE functions we can redirect them to wherever
 957 | we like, the CDB file could be stored remotely and accessed via TCP, or it
 958 | could be stored locally using a normal file, or it could be stored in memory).
 959 | 
 960 | There are two sets of functions that should be abstracted out in nearly
 961 | every library, memory allocation (or even better, the caller can pass in
 962 | fixed length structures if possible) and Input/Output functions (including
 963 | logging!). This library does both.
 964 | 
 965 | There is one area in which the library is lacking, the I/O functions do not
 966 | yield if there is nothing to read yet, or a write operation is taking too
 967 | long. This does impose constraints on the caller and how the library is used
 968 | (all calls to the library could block for an arbitrary length of time). The
 969 | callbacks could return a status indicating the caller should yield, but
 970 | yielding and restoring state to enable partially completed I/O to finish
 971 | would greatly complicate the library (this would be trivial to implement if
 972 | C had portable coroutines built into the language).
 973 | 
 974 | More libraries should be written with this information in mind.
 975 | 
 976 | ## TEST SUITE
 977 | 
 978 | There is a special note that should be mentioned about how the test suite
 979 | is handled as it is important.
 980 | 
 981 | It is difficult to make a good API that is easy to use, consistent, and
 982 | difficult to *misuse*. Bad APIs abound in common and critical software
 983 | (names will not be named) and can make an already difficult to use language
 984 | like C even more difficult to use.
 985 | 
 986 | One mistake that is often seen is API functionality that is conditional
 987 | upon a macro. This complicates the build system along with every piece of
 988 | software that is dependent on those optional calls. The most common function
 989 | to be optionally compiled in are test suite related functions if they are
 990 | present. For good reason these test suites might need to be removed from builds
 991 | (as they might take up large amounts of space for code even if they are not
 992 | needed, which is at a premium in embedded systems with limited flash memory).
 993 | 
 994 | The header often contains code like this:
 995 | 
 996 | 	#ifdef LIBRARY_UNIT_TESTS
 997 | 	int library_unit_tests(void);
 998 | 	#endif
 999 | 
1000 | And the code like this, in C like pseudo-code:
1001 | 
1002 | 	#ifdef LIBRARY_UNIT_TESTS
1003 | 	int test_function_1(void) {
1004 | 		/* might call malloc directly, making this unsuitable
1005 | 		to be included in an embedded system */
1006 | 		return result;
1007 | 	}
1008 | 
1009 | 	int library_unit_tests(void) {
1010 | 		/* tests go here */
1011 | 		if (test_function_1() != OK)
1012 | 			return FAIL;
1013 | 		return PASS;
1014 | 	}
1015 | 	#endif
1016 | 
1017 | 
1018 | In order to call this code you need to be aware of the "LIBRARY\_UNIT\_TESTS"
1019 | macro each time the function "library\_unit\_tests" is called, and worse,
1020 | whether or not your library was compiled with that macro enabled resulting
1021 | in link-time errors. Another common mistake is not passing in the functions
1022 | for I/O and allocation to the unit test framework, making it unsuitable for
1023 | embedded use (but that is a common criticism for many C libraries and not
1024 | just unit tests).
1025 | 
1026 | Compare this to this library's way of handling unit tests:
1027 | 
1028 | In the header:
1029 | 
1030 | 	int cdb_tests(const cdb_options_t *ops, const char *test_file);
1031 | 
1032 | And the *relevant* bits of code/pseudo-code:
1033 | 
1034 | 	static uint64_t xorshift128(uint64_t s[2]) {
1035 | 		assert(s);
1036 | 		/* XORSHIFT-128 algorithm */
1037 | 		return NEXT_PRNG;
1038 | 	}
1039 | 
1040 | 
1041 | 	int cdb_tests(const cdb_options_t *ops, const char *test_file) {
1042 | 		assert(ops);
1043 | 		assert(test_file);
1044 | 		BUILD_BUG_ON(sizeof (cdb_word_t) < 2);
1045 | 
1046 | 		if (CDB_TESTS_ON == 0)
1047 | 			return CDB_OK_E;
1048 | 
1049 | 		/* LOTS OF TEST CODE NOT SHOWN, some of which
1050 | 		uses "xorshift128". */
1051 | 
1052 | 		return STATUS;
1053 | 	}
1054 | 
1055 | There is no "ifdef" surrounding any of the code (using "ifdef" anywhere to
1056 | conditionally execute code is usually a mistake; it is only used within the
1057 | project to set default macro values if the macro is not previously
1058 | defined, an acceptable usage).
1059 | 
1060 | Two things are important here, the first, all of the Input and Output
1061 | and memory related functions are passed in via the "ops" structure,
1062 | as mentioned. This means that the test code is easy to port and run on
1063 | a microcontroller which might not have a file system (for testing and
1064 | development purposes you might want to run the tests on a microcontroller
1065 | but not keep them in the final product).
1066 | 
1067 | The main difference is the lack of "ifdef" guards, instead if the macro
1068 | "CDB\_TESTS\_ON" is false the function "cdb\_tests" returns "CDB\_OK\_E"
1069 | (there is some debate if the return code should be this, or something
1070 | to indicate the tests are not present, but that is a separate issue, the
1071 | important bit is the return depending on whether the tests are present).
1072 | 
1073 | This "if" statement is a *far superior* way of handling optional code in
1074 | general. The caller does not have to worry if the function is present or
1075 | not, as the function will always be present in the library. Not only that,
1076 | but if the tests are not run because the compile time macro "CDB\_TESTS\_ON"
1077 | is false then the compiler will optimize out those tests even on the lowest
1078 | optimization settings (on any decent compiler).
1079 | 
1080 | This also has the advantage that the code that is not run still goes
1081 | through the compilation step meaning the code is less likely to be wrong
1082 | when refactoring code. Not only that, but because "xorshift128" which
1083 | "cdb\_tests" depends on, is declared to be static, if "CDB\_TESTS\_ON" is
1084 | false it too will be eliminated from the compiled object file so long as no
1085 | other function calls it. In actual fact, the code has changed since
1086 | this has been written and "cdb\_prng" is exposed in the header as it is
1087 | useful in [main.c][], which is equivalent to "xorshift128".
1088 | 
1089 | # BUILD REQUIREMENTS
1090 | 
1091 | If you are building the program from the repository at
1092 | <https://github.com/howerj/cdb> you will need [GNU Make][] and a [C
1093 | Compiler][].  The library is written in pure [C99][] and should be fairly
1094 | simple to port to another platform. Other [Make][] implementations may
1095 | work, however they have not been tested. [git][] is also used as part of
1096 | the build system.
1097 | 
1098 | First clone the repository and change directory to the newly cloned repository:
1099 | 
1100 | 	git clone https://github.com/howerj/cdb cdb
1101 | 	cd cdb
1102 | 
1103 | Type 'make' to build the *cdb* executable and library.
1104 | 
1105 | Type 'make test' to build and run the *cdb* internal tests. The script called
1106 | 't', written in [sh][], does more testing, and tests that the user interface
1107 | is working correctly. 'make dist' is used to create a compressed tar file for
1108 | distribution. 'make install' can be used to install the binaries, however the
1109 | default installation directory (which can be set with the 'DESTDIR' makefile
1110 | variable) installs to a directory called 'install' within the repository -
1111 | it will not actually install anything. Changing 'DESTDIR' to '/usr' should
1112 | install everything properly. [pandoc][] is required to build the manual page
1113 | for installation, which is generated from this [markdown][] file.
1114 | 
1115 | Look at the source file [cdb.c][] to see what compile time options can be
1116 | passed to the compiler to enable and disable features (if code size is a
1117 | concern then the ability to create databases can be removed, for example).
1118 | 
1119 | # RENAME
1120 | 
1121 | CDB databases are meant to be read-only. In order to add entries to
1122 | a database, that database should be dumped and new values added in along
1123 | with the old ones. That is, to add in a new value to the database the
1124 | entire database has to be rebuilt. This is not a problem for *some* work
1125 | loads; for *some* work loads the database could be rebuilt every X hours.
1126 | 
1127 | If this does present a problem, then you should not use this database.
1128 | 
1129 | However, when a database does have to be rebuilt how do you make sure
1130 | that users of it point to the new database and not the old one?
1131 | 
1132 | If you access the database via the command line applications then
1133 | the "[rename][]" function, which is atomic on POSIX systems, will do
1134 | what is needed. That is, it provides a mechanism to swap out the old database
1135 | with a new one without affecting any of the current readers.
1136 | 
1137 | A rename can be done in C like so:
1138 | 
1139 | 	rename("new.cdb", "current.cdb"); /* Atomic rename */
1140 | 
1141 | If a reader opens "current.cdb" before the rename then it will continue
1142 | to read the old database until it closes the handle and opens up "current.cdb"
1143 | after the rename. The file's data persists even if there is no file name that
1144 | points to it so long as there are active users of that file (i.e. if a file
1145 | handle to that file is still open). This will mean that there could be
1146 | processes that use old data, but not inconsistent data. If a reader opens
1147 | up the data after the rename, it will get the new data.
1148 | 
1149 | This also means that the writer should never write to a file that is
1150 | currently in use by other readers or writers, it should write to a new
1151 | file that will be renamed to the file in use, and it also means that a
1152 | large amount of disk storage space will be in use until all users of
1153 | the old databases switch to the new databases allowing the disk space
1154 | to be reclaimed by the operating system.
1155 | 
1156 | # POSSIBLE DIRECTIONS
1157 | 
1158 | There are many additions that could be made to a project, however the
1159 | code is quite compact and neat, anything else that is needed could be built
1160 | on top of this library. Some ideas for improvement include: adding a header
1161 | along with a [CRC][], adding (unsafe) functions for rewriting key-values,
1162 | adding (de)compression (with the [shrink][] library) and decryption,
1163 | integrating the project in an embedded system in conjunction with [littlefs][]
1164 | as an example, allowing the user to supply their own comparison and hash
1165 | functions, adding types and schemas to the database, and more. The project
1166 | could also be used as the primary database library for the [pickle][]
1167 | interpreter, or for serving static content in the [eweb][] web-server.
1168 | 
1169 | All of these would add complexity, and more code - making it more useful
1170 | to some and less to others. As such, apart from bugs, the library and test
1171 | driver programs should be considered complete.
1172 | 
1173 | The lack of a header might be solved in creative ways as:
1174 | 
1175 | * The integrity of most of the file can be checked by making sure all pointers are
1176 |   within bounds, that key-value pairs are stored one after another and that
1177 |   each key is in the right bucket for that hash. The only things not checked
1178 |   would be the values (they would still have to be of the right length).
1179 | * If a file successfully passes a verification it can be identified as a valid
1180 |   CDB file of that size, this means we would not need to store header
1181 |   information about the file type and structure. This has been verified
1182 |   experimentally (the empty and randomly generated databases of a different
1183 |   size do not pass verification when the incorrect size is specified with
1184 |   the "-b" option).
1185 | * We could place the header within the key-value section of the database, or
1186 |   even at the end of the file.
1187 | 
1188 | Things that *should* and *could* be done, but have not:
1189 | 
1190 | * Fuzzing with [American Fuzzy Lop][] to iron out the most egregious
1191 | bugs, security relevant or otherwise. This has been used on the [pickle][]
1192 | library to great effect and it finds bugs that would not be caught by unit
1193 | testing alone. **The library is currently undergoing fuzzing, nothing
1194 | bad found so far**.
1195 | * The current library implements a system for looking up data
1196 | stored to disk, a *system* could be created that does so much more.
1197 | Amongst the things that could be done are:
1198 |   - Using the CDB file format only as a serialization format
1199 |   for an in memory database which would allow key deletion/replacing.
1200 |   This Key-Value store would essentially just be an in memory hash
1201 |   table with a fancy name, backed by this library. The project could
1202 |   be done as part of this library or as a separate project.
1203 |   - Implementing the [memcached protocol][] to allow remote querying
1204 |   of data.
1205 |   - Alternatively make a custom protocol that accepts commands over
1206 |   UDP.
1207 | There are a few implementation strategies for doing this.
1208 | * Alternatively, just a simple Key-Value store that uses this database
1209 | as a back-end without anything else fancy.
1210 | * Changing the library interface so it is a [header only][] C library.
1211 | * Making a set of callbacks to allow an in memory CDB database, useful
1212 | for embedding the database within binaries.
1213 | * Designing a suite of benchmarks for similar databases and implementations
1214 | of CDB, much like <https://docs.huihoo.com/qdbm/benchmark.pdf>.
1215 | 
1216 | Porting this to Rust and making a crate for it would be nice,
1217 | [although implementations already exist](https://crates.io/search?q=cdb).
1218 | Just making bindings for this library would be a good initial step, along
1219 | with other languages.
1220 | 
1221 | For more things that are possible to do:
1222 | 
1223 | * The API supplies a for-each loop mechanism where the user supplies a
1224 | callback, an iterator based solution would be more flexible (but slightly
1225 | more error prone to use).
1226 | * The user can specify their own hash algorithm, using one with perhaps
1227 | better characteristics for their purposes (and breaking compatibility
1228 | with the original format). One interesting possibility is using a hashing
1229 | algorithm that maximizes collisions of similar keys, so similar keys are
1230 | grouped together which may be useful when iterating over the database. 
1231 | Unfortunately the initial 256 wide bucket system interferes with this, 
1232 | which could be remedied by returning zero for the lowest eight bits, degrading
1233 | performance. It is not really viable to do this with this system, but
1234 | hashing algorithms that maximize collisions, such as [SOUNDEX][], are
1235 | interesting and deserve a mention. This could be paired with a user
1236 | supplied comparison function for comparing the keys themselves.
1237 | * The callbacks for the file access words ("open", "read", ...) deserve
1238 | their own structure so it can be reused, as the allocator can, although
1239 | it may require some changes to how those functions work (such as different
1240 | return values, passing in a handle to arbitrary user supplied data, and
1241 | more).
1242 | * Options for making the file checking more lax, as information could
1243 | be stored between the different key/value pairs making the file format
1244 | semi-compatible between implementations. This could be information usually
1245 | stored in the header, or information about the key/values themselves (such
1246 | as type information). Some implementations, including this one, are
1247 | more strict in what they accept.
1248 | * Some of the functions in [main.c][] could be moved into [cdb.c][] so
1249 | users do not have to reimplement them.
1250 | * A poor performance [Bloom Filter][] like algorithm can be made
1251 | using the first level hash table. A function to return whether an
1252 | item may be in the set or is definitely not can be made by checking
1253 | whether there are any items in the first level bucket that the key hashes
1254 | to. The 256 entry first level table is small enough to fit in memory, as are
1255 | the second level hash tables, which could be used to improve performance even more.
1256 | * If the user presorts the keys when adding the data then the keys can
1257 | be retrieved in order using the "foreach" API call. The user could sort
1258 | on the data instead if they like.
1259 | * The way version information is communicated within the API is not
1260 | perhaps the best way of doing it. A simple macro would suffice.
1261 | * The file format really could use a redesign. One improvement apart
1262 | from adding a header would be to move the 256 bucket initial hash table
1263 | to the end of the file so the entire file format could be streamed to
1264 | disk.
1265 | 
1266 | # BUGS
1267 | 
1268 | For any bugs, email the [author][]. It comes with a 'works on my machine'
1269 | guarantee. The code has been written with the intention of being portable,
1270 | and should work on 32-bit and 64-bit machines. It is tested more frequently
1271 | on a 64-bit Linux machine, and less frequently on Windows. Please give a
1272 | detailed bug report (including but not limited to what machine/OS you are
1273 | running on, compiler, compiler version, a failing example test case, your
1274 | blood type and star sign, etcetera).
1275 | 
1276 | # PYTHON IMPLEMENTATION
1277 | 
1278 | Available from here
1279 | <https://www.unixuser.org/~euske/doc/cdbinternals/index.html>. It
1280 | is probably the most succinct description available, and is understandable
1281 | by someone not versed in Python.
1282 | 
1283 | 	#!/usr/bin/env python
1284 | 
1285 | 	# Python implementation of cdb
1286 | 
1287 | 	# calc hash value with a given key
1288 | 	def calc_hash(s):
1289 | 	  return reduce(lambda h,c: (((h << 5) + h) ^ ord(c)) & 0xffffffffL, s, 5381)
1290 | 
1291 | 	# cdbget(fp, basepos, key)
1292 | 	def cdbget(fp, pos_header, k):
1293 | 	  from struct import unpack
1294 | 
1295 | 	  r = []
1296 | 	  h = calc_hash(k)
1297 | 
1298 | 	  fp.seek(pos_header + (h % 256)*(4+4))
1299 | 	  (pos_bucket, ncells) = unpack('<LL', fp.read(4+4))
1300 | 	  if ncells == 0: raise KeyError
1301 | 
1302 | 	  start = (h >> 8) % ncells
1303 | 	  for i in range(ncells):
1304 | 	    fp.seek(pos_bucket + ((start+i) % ncells)*(4+4))
1305 | 	    (h1, p1) = unpack('<LL', fp.read(4+4))
1306 | 	    if p1 == 0: raise KeyError
1307 | 	    if h1 == h:
1308 | 	      fp.seek(p1)
1309 | 	      (klen, vlen) = unpack('<LL', fp.read(4+4))
1310 | 	      k1 = fp.read(klen)
1311 | 	      v1 = fp.read(vlen)
1312 | 	      if k1 == k:
1313 | 		r.append(v1)
1314 | 		break
1315 | 	  else:
1316 | 	    raise KeyError
1317 | 
1318 | 	  return r
1319 | 
1320 | 
1321 | 	# cdbmake(filename, hash)
1322 | 	def cdbmake(f, a):
1323 | 	  from struct import pack
1324 | 
1325 | 	  # write cdb
1326 | 	  def write_cdb(fp):
1327 | 	    pos_header = fp.tell()
1328 | 
1329 | 	    # skip header
1330 | 	    p = pos_header+(4+4)*256  # sizeof((h,p))*256
1331 | 	    fp.seek(p)
1332 | 
1333 | 	    bucket = [ [] for i in range(256) ]
1334 | 	    # write data & make hash
1335 | 	    for (k,v) in a.iteritems():
1336 | 	      fp.write(pack('<LL',len(k), len(v)))
1337 | 	      fp.write(k)
1338 | 	      fp.write(v)
1339 | 	      h = calc_hash(k)
1340 | 	      bucket[h % 256].append((h,p))
1341 | 	      # sizeof(keylen)+sizeof(datalen)+sizeof(key)+sizeof(data)
1342 | 	      p += 4+4+len(k)+len(v)
1343 | 
1344 | 	    pos_hash = p
1345 | 	    # write hashes
1346 | 	    for b1 in bucket:
1347 | 	      if b1:
1348 | 		ncells = len(b1)*2
1349 | 		cell = [ (0,0) for i in range(ncells) ]
1350 | 		for (h,p) in b1:
1351 | 		  i = (h >> 8) % ncells
1352 | 		  while cell[i][1]:  # is call[i] already occupied?
1353 | 		    i = (i+1) % ncells
1354 | 		  cell[i] = (h,p)
1355 | 		for (h,p) in cell:
1356 | 		  fp.write(pack('<LL', h, p))
1357 | 
1358 | 	    # write header
1359 | 	    fp.seek(pos_header)
1360 | 	    for b1 in bucket:
1361 | 	      fp.write(pack('<LL', pos_hash, len(b1)*2))
1362 | 	      pos_hash += (len(b1)*2)*(4+4)
1363 | 	    return
1364 | 
1365 | 	  # main
1366 | 	  fp=file(f, "wb")
1367 | 	  write_cdb(fp)
1368 | 	  fp.close()
1369 | 	  return
1370 | 
1371 | 
1372 | 	# cdbmake by python-cdb
1373 | 	def cdbmake_true(f, a):
1374 | 	  import cdb
1375 | 	  c = cdb.cdbmake(f, f+".tmp")
1376 | 	  for (k,v) in a.iteritems():
1377 | 	    c.add(k,v)
1378 | 	  c.finish()
1379 | 	  return
1380 | 
1381 | 
1382 | 	# test suite
1383 | 	def test(n):
1384 | 	  import os
1385 | 	  from random import randint
1386 | 	  a = {}
1387 | 	  def randstr():
1388 | 	    return "".join([ chr(randint(32,126)) for i in xrange(randint(1,1000)) ])
1389 | 	  for i in xrange(n):
1390 | 	    a[randstr()] = randstr()
1391 | 	  #a = {"a":"1", "bcd":"234", "def":"567"}
1392 | 	  #a = {"a":"1"}
1393 | 	  cdbmake("my.cdb", a)
1394 | 	  cdbmake_true("true.cdb", a)
1395 | 	  # check the correctness
1396 | 	  os.system("cmp my.cdb true.cdb")
1397 | 
1398 | 	  fp = file("my.cdb")
1399 | 	  # check if all values are correctly obtained
1400 | 	  for (k,v) in a.iteritems():
1401 | 	    (v1,) = cdbget(fp, 0, k)
1402 | 	    assert v1 == v, "diff: "+repr(k)
1403 | 	  # check if nonexistent keys get error
1404 | 	  for i in xrange(n*2):
1405 | 	    k = randstr()
1406 | 	    try:
1407 | 	      v = a[k]
1408 | 	    except KeyError:
1409 | 	      try:
1410 | 		cdbget(fp, 0, k)
1411 | 		assert 0, "found: "+k
1412 | 	      except KeyError:
1413 | 		pass
1414 | 	  fp.close()
1415 | 	  return
1416 | 
1417 | 	if __name__ == "__main__":
1418 | 	  test(1000)
1419 | 
1420 | This tests the Python version implemented here against another Python
1421 | implementation. It only implements the original 32-bit version.
1422 | 
1423 | # COPYRIGHT
1424 | 
1425 | The libraries, documentation, and the test driver program are licensed under
1426 | the [Unlicense][]. Do what thou wilt.
1427 | 
1428 | [author]: mailto:howe.r.j.89@gmail.com
1429 | [main.c]: main.c
1430 | [cdb.c]: cdb.c
1431 | [host.c]: host.c
1432 | [CDB]: https://cr.yp.to/cdb.html
1433 | [GNU Make]: https://www.gnu.org/software/make/
1434 | [C Compiler]: https://gcc.gnu.org/
1435 | [C99]: https://en.wikipedia.org/wiki/C99
1436 | [littlefs]: https://github.com/ARMmbed/littlefs
1437 | [CRC]: https://en.wikipedia.org/wiki/Cyclic_redundancy_check
1438 | [shrink]: https://github.com/howerj/shrink
1439 | [djb2]: http://www.cse.yorku.ca/~oz/hash.html
1440 | [ronn]: https://www.mankier.com/1/ronn
1441 | [pandoc]: https://pandoc.org/
1442 | [Unlicense]: https://en.wikipedia.org/wiki/Unlicense
1443 | [Make]: https://en.wikipedia.org/wiki/Make_(software)
1444 | [sh]: https://en.wikipedia.org/wiki/Bourne_shell
1445 | [git]: https://git-scm.com/
1446 | [markdown]: https://daringfireball.net/projects/markdown/
1447 | [American Fuzzy Lop]: http://lcamtuf.coredump.cx/afl/
1448 | [Semantic Version Number]: https://semver.org/
1449 | [awk]: https://en.wikipedia.org/wiki/AWK
1450 | [original cdb]: https://cr.yp.to/cdb.html
1451 | [pickle]: https://github.com/howerj/pickle
1452 | [eweb]: https://github.com/howerj/eweb
1453 | [binary file format]: https://stackoverflow.com/questions/323604
1454 | [memcached protocol]: https://raw.githubusercontent.com/memcached/memcached/master/doc/protocol.txt
1455 | [header only]: https://en.wikipedia.org/wiki/Header-only
1456 | [Embedded systems]: https://en.wikipedia.org/wiki/Embedded_system
1457 | [opaque pointer]: https://en.wikipedia.org/wiki/Opaque_pointer
1458 | [rename]: https://cplusplus.com/reference/cstdio/rename/
1459 | [memcmp]: https://cplusplus.com/reference/cstring/memcmp/
1460 | [stdio.h]: https://cplusplus.com/reference/cstdio/
1461 | [fread]: https://cplusplus.com/reference/cstdio/fread/
1462 | [ftell]: https://cplusplus.com/reference/cstdio/ftell/
1463 | [SOUNDEX]: https://en.wikipedia.org/wiki/Soundex
1464 | [Bloom Filter]: https://en.wikipedia.org/wiki/Bloom_filter
1465 | 


--------------------------------------------------------------------------------
/t:
--------------------------------------------------------------------------------
  1 | #!/bin/sh
  2 | # CDB Test Script
  3 | #
  4 | RANDOMSRC=${RANDOMSRC:-/dev/urandom};
  5 | TESTDB=test.cdb;
  6 | EMPTYDB=empty.cdb;
  7 | FAIL=0;
  8 | PERFORMANCE=${PERFORMANCE:-test.cdb};
  9 | CDB=${CDB:-cdb};
 10 | 
 11 | performance () {
 12 | 	set -eux;
 13 | 	make test;
 14 | 
 15 | 	time -p ./cdb -s   "${PERFORMANCE}" > /dev/null;
 16 | 	time -p cdb   -s   "${PERFORMANCE}" > /dev/null;
 17 | 	time -p cdbstats < "${PERFORMANCE}" > /dev/null;
 18 | 
 19 | 	time -p ./cdb   -d "${PERFORMANCE}" > /dev/null;
 20 | 	time -p cdb     -d "${PERFORMANCE}" > /dev/null;
 21 | 	time -p cdbdump  < "${PERFORMANCE}" > /dev/null;
 22 | 
 23 | 	./cdb   -d "${PERFORMANCE}" > 1.txt;
 24 | 	cdb     -d "${PERFORMANCE}" > 2.txt;
 25 | 	cdbdump  < "${PERFORMANCE}" > 3.txt;
 26 | 
 27 | 	time -p ./cdb -c 1.cdb          < 1.txt;
 28 | 	time -p cdb   -c 2.cdb          < 2.txt;
 29 | 	time -p cdbmake  3.cdb temp.cdb < 3.txt;
 30 | }
 31 | 
 32 | usage () {
 33 | HELP=$(cat <<EOF
 34 | cdb test and performance suite
 35 | 
 36 | By default this program will run a series of tests on the various
 37 | versions of the CDB.
 38 | 
 39 | -h	print this help and exit successfully	
 40 | -p	do performance tests instead on default file
 41 | -P #	set CDB file for performance tests and run tests
 42 | 
 43 | This program will return zero and non-zero on failure.
 44 | EOF
 45 | );
 46 | 	echo "${HELP}"
 47 | }
 48 | 
 49 | while getopts 'hpP:' opt
 50 | do
 51 | 	case "${opt}" in
 52 | 		h) usage; exit 0; ;;
 53 | 		p) performance; exit 0; ;;
 54 | 		P) PERFORMANCE="${OPTARG}"; performance; exit 0; ;;
 55 | 		?) usage; exit 1; ;;
 56 | 	esac
 57 | done
 58 | 
 59 | make ${CDB};
 60 | for SIZE in 32 16 64; do
	set -eux;

	# The empty here-document gives "-c" no records to add.
	"./${CDB}" -b "${SIZE}" -c "${EMPTYDB}" <<EOF
EOF
	# Round trip: bist.cdb is dumped and sorted, rebuilt into copy.cdb from
	# that text, dumped and sorted again, and the two listings compared.
	# NOTE(review): "-t" presumably runs the built-in self tests and leaves
	# bist.cdb behind -- confirm against main.c.
	"./${CDB}" -b "${SIZE}" -t bist.cdb;
	"./${CDB}" -b "${SIZE}" -d bist.cdb | sort > bist.txt;
	"./${CDB}" -b "${SIZE}" -c copy.cdb -T temp.cdb < bist.txt;
	"./${CDB}" -b "${SIZE}" -d copy.cdb | sort > copy.txt;
	diff -w bist.txt copy.txt;

	# Fixture for the look-up checks: an empty key, an empty value, and the
	# key "a" stored three times (values b, b, c).
	"./${CDB}" -b "${SIZE}" -c "${TESTDB}" <<EOF
+0,1:->X
+1,0:X->
+1,1:a->b
+1,1:a->b
+1,1:a->c
+1,5:b->hello
+1,5:c->world
+4,7:open->seasame
EOF
	set +x;
 82 | 
	# t COMMAND EXPECTED
	#
	# Evaluate COMMAND and compare its standard output with EXPECTED.
	# Prints an "ok" or "FAIL" line; sets the global FAIL to 1 when the
	# output differs or when COMMAND exits with a non-zero status.
	# Uses the globals C (exit status) and R (captured output) as scratch.
	t() {
		C=0;
		# "|| C=$?" stops "set -e" from aborting the whole run inside the
		# assignment, so a failing command is reported and recorded.
		R=$(eval "${1}") || C=$?;
		if [ "${C}" -ne 0 ] || [ "${R}" != "${2}" ]; then
			echo "FAIL: '${1}' != '${2}'";
			FAIL=1;
		else
			echo "ok:  '${1}' = '${2}'";
		fi;
	}
 92 | 
	# f COMMAND
	#
	# Evaluate COMMAND and expect it to fail. Prints an "ok" or "FAIL"
	# line; sets the global FAIL to 1 when COMMAND exits with status zero.
	# Uses the globals C (exit status) and R (captured output) as scratch.
	f() {
		# C only stays zero when the command succeeded; "|| C=$?" also
		# keeps "set -e" from aborting on the expected failure.
		C=0
		R=$(eval "${1}") || C=$?;
		if [ "${C}" -eq 0 ]; then
			echo "FAIL: '${1}' expected a failure";
			FAIL=1;
		else
			echo "ok:  '${1}' reports failure as expected: ${C}/${R}";
		fi;
	}
103 | 
104 | 	t "./${CDB} -b ${SIZE} -q ${TESTDB} a" b;
105 | 	t "./${CDB} -b ${SIZE} -q ${TESTDB} a 0" b;
106 | 	t "./${CDB} -b ${SIZE} -q ${TESTDB} a 1" b;
107 | 	t "./${CDB} -b ${SIZE} -q ${TESTDB} a 2" c;
108 | 	f "./${CDB} -b ${SIZE} -q ${TESTDB} a 3";
109 | 	t "./${CDB} -b ${SIZE} -q ${TESTDB} X" "";
110 | 	f "./${CDB} -b ${SIZE} -q ${TESTDB} XXX";
111 | 	t "./${CDB} -b ${SIZE} -q ${TESTDB} \"\"" X;
112 | 	t "./${CDB} -b ${SIZE} -q ${TESTDB} b" hello;
113 | 	t "./${CDB} -b ${SIZE} -q ${TESTDB} c" world;
114 | 	t "./${CDB} -b ${SIZE} -q ${TESTDB} open" seasame;
115 | 
116 | 	for i in $(seq 0 9); do
117 | 		for j in $(seq 0 9); do
118 | 			for k in $(seq 0 9); do
119 | 				KEY="${i}${j}${k}"
120 | 				VAL="${i}${j}${k}"
121 | 				echo "+${#KEY},${#VAL}:${KEY}->${VAL}";
122 | 			done;
123 | 		done;
124 | 	done > seq.txt;
125 | 	echo > seq.txt
126 | 
127 | 	dd if=/dev/zero of=invalid-1.cdb count=1 # Too small
128 | 	dd if=/dev/zero of=invalid-2.cdb count=4 # Invalid hash table pointers
129 | 	#dd if=${RANDOMSRC} of=invalid-3.cdb count=512
130 | 
131 | 	f "./${CDB} -b ${SIZE} -s invalid-1.cdb"
132 | 	f "./${CDB} -b ${SIZE} -s invalid-2.cdb"
133 | 	#f "./${CDB} -s invalid-3.cdb"
134 | 	f "./${CDB} -b ${SIZE} -s /dev/null"
135 | 
136 | 	set -x
137 | 
138 | 	./${CDB} -b ${SIZE} -c seq.cdb < seq.txt;
139 | 	./${CDB} -b ${SIZE} -d seq.cdb | sort > qes.txt;
140 | 
141 | 	diff -w seq.txt qes.txt;
142 | 
143 | 	./${CDB} -b ${SIZE} -s ${EMPTYDB}
144 | 	./${CDB} -b ${SIZE} -s seq.cdb;
145 | 	./${CDB} -b ${SIZE} -s ${TESTDB}
146 | 	./${CDB} -b ${SIZE} -s bist.cdb;
147 | 
148 | 	dd if=/dev/zero of=offset.bin count=5 bs=512
149 | 	cat offset.bin test.cdb > offset.cdb
150 | 	./${CDB} -o 2560 -b ${SIZE} -V offset.cdb;
151 | 
152 | 	set +x;
153 | done;
154 | 
155 | make clean
156 | exit ${FAIL};
157 | 


--------------------------------------------------------------------------------