├── Design of CuttDB.pdf ├── src ├── cuttdb.c ├── vio_apnd2.h ├── cdb_crc64.h ├── cdb_errno.h ├── cdb_vio.c ├── cdb_bloomfilter.h ├── cdb_lock.h ├── cdb_bgtask.h ├── cdb_dumpdb.c ├── cdb_builddb.c ├── cdb_lock.c ├── cdb_errno.c ├── ae_select.c ├── ae_kqueue.c ├── cdb_bgtask.c ├── cdb_dumpraw.c ├── cdb_vio.h ├── ae_epoll.c ├── cdb_core.h ├── cdb_types.h ├── test_mt.c ├── cdb_bloomfilter.c ├── cdb_hashtable.h ├── server-thread.c ├── cuttdb.h ├── cuttdb-server.h ├── cdb_crc64.c ├── cdb_hashtable.c └── cdb_core.c ├── LICENSE ├── Makefile └── README.md /Design of CuttDB.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fusiyuan2010/cuttdb/HEAD/Design of CuttDB.pdf -------------------------------------------------------------------------------- /src/cuttdb.c: -------------------------------------------------------------------------------- 1 | /* 2 | * CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 8 | * Use and distribution licensed under the BSD license. 9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | 16 | #include "cuttdb.h" 17 | #include "cdb_types.h" 18 | #include "cdb_vio.h" 19 | 20 | 21 | /* nothing here */ 22 | -------------------------------------------------------------------------------- /src/vio_apnd2.h: -------------------------------------------------------------------------------- 1 | /* 2 | * CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 8 | * Use and distribution licensed under the BSD license. 
9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | 16 | #ifndef _VIO_APND2_H_ 17 | #define _VIO_APND2_H_ 18 | #include "cdb_vio.h" 19 | 20 | 21 | void vio_apnd2_init(CDBVIO *vio); 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /src/cdb_crc64.h: -------------------------------------------------------------------------------- 1 | /* 2 | * CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 8 | * Use and distribution licensed under the BSD license. 9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | 16 | #ifndef _CDB_CRC64_H_ 17 | #define _CDB_CRC64_H_ 18 | #include 19 | 20 | uint64_t cdb_crc64(const void *buf, uint32_t len); 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /src/cdb_errno.h: -------------------------------------------------------------------------------- 1 | /* 2 | * CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 8 | * Use and distribution licensed under the BSD license. 9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | 16 | #ifndef _CDB_ERRNO_H_ 17 | #define _CDB_ERRNO_H_ 18 | 19 | void cdb_seterrno(CDB *db, int ecode, const char *source, int line); 20 | 21 | #endif 22 | 23 | -------------------------------------------------------------------------------- /src/cdb_vio.c: -------------------------------------------------------------------------------- 1 | /* 2 | * CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 8 | * Use and distribution licensed under the BSD license. 
9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | 16 | #include "cdb_vio.h" 17 | #include "cdb_types.h" 18 | #include "vio_apnd2.h" 19 | #include "stdlib.h" 20 | 21 | 22 | CDBVIO *cdb_vio_new(int type) 23 | { 24 | CDBVIO *res; 25 | res = (CDBVIO *)malloc(sizeof(CDBVIO)); 26 | switch(type) { 27 | case CDBVIOAPND2: 28 | vio_apnd2_init(res); 29 | break; 30 | default: 31 | vio_apnd2_init(res); 32 | break; 33 | } 34 | return res; 35 | } 36 | 37 | int cdb_vio_destroy(CDBVIO *vio) 38 | { 39 | free(vio); 40 | return 0; 41 | } 42 | 43 | -------------------------------------------------------------------------------- /src/cdb_bloomfilter.h: -------------------------------------------------------------------------------- 1 | /* 2 | * CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 8 | * Use and distribution licensed under the BSD license. 9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | 16 | /* 17 | Bloom Filter is currently not used in cuttdb 18 | */ 19 | #ifndef _CDB_BLOOMFILTER_H_ 20 | #define _CDB_BLOOMFILTER_H_ 21 | #include 22 | #include 23 | 24 | typedef struct CDBBLOOMFILTER CDBBLOOMFILTER; 25 | 26 | #define CDBBFRATIO 8 27 | 28 | CDBBLOOMFILTER *cdb_bf_new(uint64_t rnum, uint64_t size); 29 | void cdb_bf_set(CDBBLOOMFILTER *bf, void *key, int ksize); 30 | bool cdb_bf_exist(CDBBLOOMFILTER *bf, void *key, int ksize); 31 | void cdb_bf_clean(CDBBLOOMFILTER *bf); 32 | void cdb_bf_destroy(CDBBLOOMFILTER *bf); 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /src/cdb_lock.h: -------------------------------------------------------------------------------- 1 | /* 2 | * CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 
8 | * Use and distribution licensed under the BSD license. 9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | 16 | #ifndef _CDB_LOCK_H_ 17 | #define _CDB_LOCK_H_ 18 | 19 | 20 | enum { 21 | /* spinlock */ 22 | CDB_LOCKSPIN, 23 | /* mutex, which may cause OS context switch, mainly used in where Disk IO happens */ 24 | CDB_LOCKMUTEX, 25 | }; 26 | 27 | /* may be used to indicated whether the area is protected */ 28 | enum { 29 | CDB_LOCKED, 30 | CDB_NOTLOCKED, 31 | }; 32 | 33 | typedef struct CDBLOCK 34 | { 35 | int ltype; 36 | char lock[0]; 37 | } CDBLOCK; 38 | 39 | 40 | CDBLOCK *cdb_lock_new(int ltype); 41 | void cdb_lock_lock(CDBLOCK *lock); 42 | void cdb_lock_unlock(CDBLOCK *lock); 43 | void cdb_lock_destory(CDBLOCK *lock); 44 | int cdb_lock_trylock(CDBLOCK *lock); 45 | 46 | 47 | 48 | #endif 49 | 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012, Siyuan Fu 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | -------------------------------------------------------------------------------- /src/cdb_bgtask.h: -------------------------------------------------------------------------------- 1 | /* 2 | * CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 8 | * Use and distribution licensed under the BSD license. 9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | 16 | #ifndef _CDB_BGTASK_H_ 17 | #define _CDB_BGTASK_H_ 18 | #include 19 | #include 20 | 21 | 22 | /* 16 tasks at most in a task thread */ 23 | #define MAXTASKNUM 16 24 | 25 | typedef void (*TASKFUNC)(void *); 26 | 27 | /* struct for timer task */ 28 | typedef struct { 29 | /* task function */ 30 | TASKFUNC func; 31 | /* task argument */ 32 | void *arg; 33 | /* task run interval(seconds) */ 34 | int intval; 35 | /* time of last run */ 36 | time_t ltime; 37 | } TASK; 38 | 39 | /* struct for a background task manager */ 40 | typedef struct CDBBGTASK 41 | { 42 | TASK tasks[MAXTASKNUM]; 43 | /* number of tasks */ 44 | int tnum; 45 | /* is running? 
*/ 46 | int run; 47 | pthread_t tid; 48 | /* for wait the thread exit */ 49 | pthread_mutex_t smutex; 50 | pthread_cond_t scond; 51 | } CDBBGTASK; 52 | 53 | 54 | 55 | CDBBGTASK *cdb_bgtask_new(); 56 | int cdb_bgtask_add(CDBBGTASK *task, TASKFUNC func, void *arg, int intval); 57 | void cdb_bgtask_start(CDBBGTASK *bt); 58 | void cdb_bgtask_stop(CDBBGTASK *task); 59 | void cdb_bgtask_destroy(CDBBGTASK *task); 60 | 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /src/cdb_dumpdb.c: -------------------------------------------------------------------------------- 1 | /* 2 | * CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 8 | * Use and distribution licensed under the BSD license. 9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | #include "cuttdb.h" 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | 23 | bool itcb(void *arg, const char *key, int ksize, const char *val, int vsize, uint32_t expire, uint64_t oid) 24 | { 25 | #define SBUFSIZE 4096 26 | char buf[SBUFSIZE]; 27 | char *kvbuf = buf; 28 | if (ksize + vsize + 2 > SBUFSIZE) 29 | kvbuf = (char*)malloc(ksize + vsize + 2); 30 | memcpy(kvbuf, key, ksize); 31 | kvbuf[ksize] = '\t'; 32 | memcpy(kvbuf + ksize + 1, val, vsize); 33 | kvbuf[ksize + vsize + 1] = '\0'; 34 | printf("%s\t%u\n", kvbuf, expire); 35 | if (kvbuf != buf) 36 | free(kvbuf); 37 | return true; 38 | } 39 | 40 | int main(int argc, char *argv[]) 41 | { 42 | /* 1TB */ 43 | int cache_limit = 1048576; 44 | 45 | if (argc < 2) { 46 | fprintf(stderr, "Usage: %s dbpath [cachelimit(MB)].... 
\n", argv[0]); 47 | return -1; 48 | } 49 | if (argc > 2) { 50 | cache_limit = atoi(argv[2]); 51 | } 52 | 53 | CDB *db = cdb_new(); 54 | cdb_option(db, 0, 0, cache_limit); 55 | if (cdb_open(db, argv[1], CDB_PAGEWARMUP) < 0) { 56 | fprintf(stderr, "Database open error, unable to recovery\n"); 57 | return -1; 58 | } 59 | void *it = cdb_iterate_new(db, 0); 60 | cdb_iterate(db, itcb, NULL, it); 61 | cdb_iterate_destroy(db, it); 62 | cdb_destroy(db); 63 | } 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | #build a prototype k-v database 3 | 4 | OPT=-O2 5 | DEBUG=-g 6 | CFLAGS=-std=gnu99 -Wall -fPIC $(OPT) $(DEBUG) -DHAVE_EPOLL 7 | 8 | CC=gcc 9 | LCOMMON=-lrt -lpthread 10 | 11 | ifeq ($(GOOGPERF),yes) 12 | PROFILER=-DGOOG_PROFILER 13 | LPROFILER=-lprofiler 14 | endif 15 | 16 | OBJDIR := objs 17 | BUILDDIR := build 18 | SRCDIR := src 19 | OBJS := $(addprefix $(OBJDIR)/, cdb_bgtask.o cdb_bloomfilter.o cdb_core.o cdb_crc64.o cdb_errno.o cdb_hashtable.o cdb_lock.o cdb_vio.o vio_apnd2.o) 20 | 21 | all: library exes 22 | 23 | library: $(BUILDDIR)/libcuttdb.a $(BUILDDIR)/libcuttdb.so 24 | exes: $(BUILDDIR)/cuttdb-server $(BUILDDIR)/cdb_dumpraw $(BUILDDIR)/cdb_builddb $(BUILDDIR)/cdb_dumpdb 25 | test: $(BUILDDIR)/test_mt 26 | 27 | $(BUILDDIR)/cdb_dumpdb: $(OBJDIR)/cdb_dumpdb.o $(BUILDDIR)/libcuttdb.a 28 | $(CC) $(CFLAGS) -o $@ $^ $(LCOMMON) 29 | 30 | $(BUILDDIR)/test_mt: $(SRCDIR)/test_mt.c $(BUILDDIR)/libcuttdb.a 31 | $(CC) $(CFLAGS) -o $@ $^ $(LCOMMON) -Wno-format 32 | 33 | $(BUILDDIR)/cdb_dumpraw: $(SRCDIR)/cdb_dumpraw.c 34 | $(CC) $(CFLAGS) -o $@ $^ 35 | 36 | $(BUILDDIR)/cdb_builddb: $(OBJDIR)/cdb_builddb.o $(BUILDDIR)/libcuttdb.a 37 | $(CC) $(CFLAGS) -o $@ $^ $(LCOMMON) 38 | 39 | $(BUILDDIR)/cuttdb-server: $(OBJDIR)/cuttdb-server.o $(OBJDIR)/server-thread.o $(BUILDDIR)/libcuttdb.a 40 | $(CC) -o $@ $^ 
$(LCOMMON) 41 | 42 | $(BUILDDIR)/libcuttdb.so: $(OBJDIR) $(BUILDDIR) $(OBJS) 43 | $(CC) -shared -o $@ $(OBJS) $(LPROFILER) $(LCOMMON) 44 | 45 | $(BUILDDIR)/libcuttdb.a: $(OBJDIR) $(BUILDDIR) $(OBJS) 46 | ar cqs $@ $(OBJS) 47 | 48 | $(BUILDDIR): 49 | mkdir -p $(BUILDDIR) 50 | 51 | $(OBJDIR): 52 | mkdir -p $(OBJDIR) 53 | 54 | $(OBJDIR)/%.o: $(SRCDIR)/%.c 55 | $(CC) -c $(CFLAGS) -o $@ $^ $(PROFILER) 56 | 57 | clean: 58 | rm -rf $(OBJDIR) $(BUILDDIR) 59 | 60 | cleanobj: 61 | rm -rf $(OBJDIR) 62 | 63 | rebuild: clean all 64 | 65 | install: library $(SRCDIR)/cuttdb.h 66 | cp $(BUILDDIR)/libcuttdb.a $(BUILDDIR)/libcuttdb.so /usr/lib/ 67 | cp $(SRCDIR)/cuttdb.h /usr/include/ 68 | -------------------------------------------------------------------------------- /src/cdb_builddb.c: -------------------------------------------------------------------------------- 1 | /* 2 | * CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 8 | * Use and distribution licensed under the BSD license. 9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | #include "cuttdb.h" 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | int main(int argc, char *argv[]) 22 | { 23 | CDB *db = cdb_new(); 24 | if (argc < 2) { 25 | fprintf(stderr, "Usage: %s db_path [hsize = 2000000]\n", argv[0]); 26 | return 0; 27 | } 28 | 29 | /* 1TB memory limit(unlimited) */ 30 | cdb_option(db, argc >= 3? 
atoi(argv[2]):2000000 , 0, 1048576); 31 | cdb_seterrcb(db, cdb_deferrorcb, NULL); 32 | if (cdb_open(db, argv[1], CDB_CREAT | CDB_PAGEWARMUP) < 0) { 33 | return -1; 34 | } 35 | char *buf = NULL; 36 | long count = 0; 37 | 38 | size_t size, size2; 39 | while((size = getline(&buf, &size2, stdin)) != -1) { 40 | /* remove the delimiter*/ 41 | buf[--size] = '\0'; 42 | int klen = -1; 43 | int vlen = -1; 44 | uint32_t expire = 0; 45 | int parsenum = 0; 46 | for(int i = 0; i < size; i++) { 47 | if (buf[i] == '\t') { 48 | if (klen == -1) 49 | klen = i; 50 | else { 51 | vlen = i - klen - 1; 52 | parsenum = 1; 53 | } 54 | } else if (buf[i] >= '0' && buf[i] <= '9' && parsenum) { 55 | expire = expire * 10 + buf[i] - '0'; 56 | } 57 | } 58 | 59 | if (klen > 0 && vlen > 0) { 60 | cdb_set2(db, buf, klen, buf + klen + 1, vlen, 61 | CDB_OVERWRITE, expire > 0? expire - time(NULL): 0); 62 | count++; 63 | } 64 | free(buf); 65 | buf = NULL; 66 | } 67 | cdb_destroy(db); 68 | fprintf(stderr, "imported %ld records\n", count); 69 | return 0; 70 | } 71 | 72 | 73 | -------------------------------------------------------------------------------- /src/cdb_lock.c: -------------------------------------------------------------------------------- 1 | /* 2 | * CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 8 | * Use and distribution licensed under the BSD license. 
9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | 16 | #include "cdb_lock.h" 17 | #include 18 | #include 19 | #include 20 | 21 | 22 | CDBLOCK *cdb_lock_new(int ltype) 23 | { 24 | CDBLOCK *lock = NULL; 25 | if (ltype == CDB_LOCKSPIN) { 26 | lock = (CDBLOCK *)malloc(sizeof(CDBLOCK) + sizeof(pthread_spinlock_t)); 27 | pthread_spin_init((pthread_spinlock_t*)&lock->lock, PTHREAD_PROCESS_PRIVATE); 28 | } else if (ltype == CDB_LOCKMUTEX) { 29 | lock = (CDBLOCK *)malloc(sizeof(CDBLOCK) + sizeof(pthread_mutex_t)); 30 | pthread_mutex_init((pthread_mutex_t*)&lock->lock, NULL); 31 | } 32 | lock->ltype = ltype; 33 | 34 | return lock; 35 | } 36 | 37 | 38 | void cdb_lock_lock(CDBLOCK *lock) 39 | { 40 | if (lock->ltype == CDB_LOCKSPIN) 41 | pthread_spin_lock((pthread_spinlock_t*)&lock->lock); 42 | else if (lock->ltype == CDB_LOCKMUTEX) 43 | pthread_mutex_lock((pthread_mutex_t*)&lock->lock); 44 | } 45 | 46 | 47 | void cdb_lock_unlock(CDBLOCK *lock) 48 | { 49 | if (lock->ltype == CDB_LOCKSPIN) 50 | pthread_spin_unlock((pthread_spinlock_t*)&lock->lock); 51 | else if (lock->ltype == CDB_LOCKMUTEX) 52 | pthread_mutex_unlock((pthread_mutex_t*)&lock->lock); 53 | } 54 | 55 | 56 | void cdb_lock_destory(CDBLOCK *lock) 57 | { 58 | if (lock->ltype == CDB_LOCKSPIN) 59 | pthread_spin_destroy((pthread_spinlock_t*)&lock->lock); 60 | else if (lock->ltype == CDB_LOCKMUTEX) 61 | pthread_mutex_destroy((pthread_mutex_t*)&lock->lock); 62 | 63 | free(lock); 64 | } 65 | 66 | 67 | int cdb_lock_trylock(CDBLOCK *lock) 68 | { 69 | if (lock->ltype == CDB_LOCKSPIN) 70 | return pthread_spin_trylock((pthread_spinlock_t*)&lock->lock); 71 | else if (lock->ltype == CDB_LOCKMUTEX) 72 | return pthread_mutex_trylock((pthread_mutex_t*)&lock->lock); 73 | return 0; 74 | } 75 | 76 | -------------------------------------------------------------------------------- /src/cdb_errno.c: -------------------------------------------------------------------------------- 1 | /* 2 | * 
CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 8 | * Use and distribution licensed under the BSD license. 9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | 16 | #include "cuttdb.h" 17 | #include "cdb_errno.h" 18 | #include "cdb_types.h" 19 | #include "cdb_core.h" 20 | #include 21 | 22 | 23 | int cdb_errno(CDB *db) 24 | { 25 | return (long)pthread_getspecific(*(pthread_key_t*)db->errkey); 26 | } 27 | 28 | const char *cdb_errmsg(int ecode) 29 | { 30 | switch(ecode) { 31 | case CDB_SUCCESS: 32 | return "Success"; 33 | case CDB_NOTFOUND: 34 | return "Key Not Found"; 35 | case CDB_EXIST: 36 | return "Item Already Exists"; 37 | case CDB_DIRNOEXIST: 38 | return "Path Open Failed"; 39 | case CDB_OPENERR: 40 | return "File Open Failed"; 41 | case CDB_PIDEXIST: 42 | return "Opened By Another Process"; 43 | case CDB_DATAERRDAT: 44 | return "Data File Content Error"; 45 | case CDB_DATAERRIDX: 46 | return "Index File Content Error"; 47 | case CDB_WRITEERR: 48 | return "Write To File Error"; 49 | case CDB_READERR: 50 | return "Read From File Error"; 51 | case CDB_NOFID: 52 | return "Internal File Lost"; 53 | case CDB_INTERNALERR: 54 | return "Internal Error"; 55 | case CDB_DATAERRMETA: 56 | return "File Header Error"; 57 | case CDB_MEMDBNOCACHE: 58 | return "MemDB Mode With Zero Record Cache Size"; 59 | default: 60 | return "Error For Errno"; 61 | } 62 | } 63 | 64 | 65 | void cdb_seterrcb(CDB *db, CDB_ERRCALLBACK errcb, void *arg) 66 | { 67 | db->errcb = errcb; 68 | db->errcbarg = arg; 69 | } 70 | 71 | 72 | void cdb_seterrno(CDB *db, int ecode, const char *source, int line) 73 | { 74 | pthread_setspecific(*(pthread_key_t*)db->errkey, (void*)(long)ecode); 75 | if (ecode != CDB_SUCCESS && db->errcb) { 76 | db->errcb(db->errcbarg, ecode, source, line); 77 | } 78 | } 79 | 
-------------------------------------------------------------------------------- /src/ae_select.c: -------------------------------------------------------------------------------- 1 | /* Select()-based ae.c module 2 | * Copyright (C) 2009-2010 Salvatore Sanfilippo - antirez@gmail.com 3 | * Released under the BSD license. See the COPYING file for more info. */ 4 | 5 | #include 6 | 7 | typedef struct aeApiState { 8 | fd_set rfds, wfds; 9 | /* We need to have a copy of the fd sets as it's not safe to reuse 10 | * FD sets after select(). */ 11 | fd_set _rfds, _wfds; 12 | } aeApiState; 13 | 14 | static int aeApiCreate(EventLoop *eventLoop) { 15 | aeApiState *state = malloc(sizeof(aeApiState)); 16 | 17 | if (!state) return -1; 18 | FD_ZERO(&state->rfds); 19 | FD_ZERO(&state->wfds); 20 | eventLoop->apidata = state; 21 | return 0; 22 | } 23 | 24 | static void aeApiFree(EventLoop *eventLoop) { 25 | free(eventLoop->apidata); 26 | } 27 | 28 | static int aeApiAddEvent(EventLoop *eventLoop, int fd, int mask) { 29 | aeApiState *state = eventLoop->apidata; 30 | 31 | if (mask & AE_READABLE) FD_SET(fd,&state->rfds); 32 | if (mask & AE_WRITABLE) FD_SET(fd,&state->wfds); 33 | return 0; 34 | } 35 | 36 | static void aeApiDelEvent(EventLoop *eventLoop, int fd, int mask) { 37 | aeApiState *state = eventLoop->apidata; 38 | 39 | if (mask & AE_READABLE) FD_CLR(fd,&state->rfds); 40 | if (mask & AE_WRITABLE) FD_CLR(fd,&state->wfds); 41 | } 42 | 43 | static int aeApiPoll(EventLoop *eventLoop, struct timeval *tvp) { 44 | aeApiState *state = eventLoop->apidata; 45 | int retval, j, numevents = 0; 46 | 47 | memcpy(&state->_rfds,&state->rfds,sizeof(fd_set)); 48 | memcpy(&state->_wfds,&state->wfds,sizeof(fd_set)); 49 | 50 | retval = select(eventLoop->maxfd+1, 51 | &state->_rfds,&state->_wfds,NULL,tvp); 52 | if (retval > 0) { 53 | for (j = 0; j <= eventLoop->maxfd; j++) { 54 | int mask = 0; 55 | aeFileEvent *fe = &eventLoop->events[j]; 56 | 57 | if (fe->mask == AE_NONE) continue; 58 | if (fe->mask & 
/* Per-loop state of the kqueue backend. */
typedef struct aeApiState {
    int kqfd;
    struct kevent events[AE_SETSIZE];
} aeApiState;

/* Allocate the backend state and open a kernel queue.
 * Returns 0 on success, -1 on allocation or kqueue() failure. */
static int aeApiCreate(EventLoop *eventLoop) {
    aeApiState *state = malloc(sizeof(*state));

    if (!state) return -1;
    state->kqfd = kqueue();
    if (state->kqfd == -1) {
        /* do not leak the state when the queue cannot be created */
        free(state);
        return -1;
    }
    eventLoop->apidata = state;

    return 0;
}

/* Close the kernel queue and release the backend state. */
static void aeApiFree(EventLoop *eventLoop) {
    aeApiState *state = eventLoop->apidata;

    close(state->kqfd);
    free(state);
}

/* Register fd for the conditions in mask (AE_READABLE / AE_WRITABLE).
 * Returns 0 on success, -1 as soon as one registration fails. */
static int aeApiAddEvent(EventLoop *eventLoop, int fd, int mask) {
    aeApiState *state = eventLoop->apidata;
    struct kevent ke;

    if (mask & AE_READABLE) {
        EV_SET(&ke, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
        if (kevent(state->kqfd, &ke, 1, NULL, 0, NULL) == -1) return -1;
    }
    if (mask & AE_WRITABLE) {
        EV_SET(&ke, fd, EVFILT_WRITE, EV_ADD, 0, 0, NULL);
        if (kevent(state->kqfd, &ke, 1, NULL, 0, NULL) == -1) return -1;
    }
    return 0;
}

/* Updating a registration is implemented as adding it again. */
static int aeApiUpdateEvent(EventLoop *eventLoop, int fd, int mask) {
    return aeApiAddEvent(eventLoop, fd, mask);
}

/* Remove both the read and the write registration of fd. Always returns 0;
 * failures (for example a filter that was never added) are ignored, as before. */
static int aeApiDelEvent(EventLoop *eventLoop, int fd) {
    aeApiState *state = eventLoop->apidata;
    struct kevent ke;

    /* EVFILT_READ and EVFILT_WRITE are distinct filter identifiers, not bit
     * flags, so each one needs its own EV_DELETE change; OR-ing them names
     * a single filter and leaves the other registered. */
    EV_SET(&ke, fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
    kevent(state->kqfd, &ke, 1, NULL, 0, NULL);
    EV_SET(&ke, fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL);
    kevent(state->kqfd, &ke, 1, NULL, 0, NULL);
    return 0;
}

/* Wait for events for at most *tvp (forever when tvp is NULL) and record the
 * identifiers of the ready descriptors in eventLoop->fired.
 * Returns the number of entries written, 0 on timeout or error. */
static int aeApiPoll(EventLoop *eventLoop, struct timeval *tvp) {
    aeApiState *state = eventLoop->apidata;
    int retval, numevents = 0;

    if (tvp != NULL) {
        struct timespec timeout;
        timeout.tv_sec = tvp->tv_sec;
        timeout.tv_nsec = tvp->tv_usec * 1000;
        retval = kevent(state->kqfd, NULL, 0, state->events, AE_SETSIZE, &timeout);
    } else {
        retval = kevent(state->kqfd, NULL, 0, state->events, AE_SETSIZE, NULL);
    }

    if (retval > 0) {
        int j;

        numevents = retval;
        for(j = 0; j < numevents; j++) {
            int mask = 0;
            struct kevent *e = state->events+j;

            if (e->filter == EVFILT_READ) mask |= AE_READABLE;
            if (e->filter == EVFILT_WRITE) mask |= AE_WRITABLE;
            /* NOTE(review): mask is computed but never stored, while the
             * select() backend fills fired[].fd and fired[].mask - confirm
             * which layout EventLoop.fired really has before changing this. */
            eventLoop->fired[j] = e->ident;
        }
    }
    return numevents;
}

/* Name of this backend. */
static char *aeApiName(void) {
    return "kqueue";
}
9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | 16 | #include "cdb_bgtask.h" 17 | #include 18 | #include 19 | 20 | 21 | /* where thread begins */ 22 | static void *_cdb_bgtask_func(void *arg); 23 | 24 | 25 | CDBBGTASK *cdb_bgtask_new() 26 | { 27 | CDBBGTASK *bt = (CDBBGTASK *)malloc(sizeof(CDBBGTASK)); 28 | 29 | bt->tnum = 0; 30 | bt->run = 0; 31 | bt->tid = 0; 32 | pthread_cond_init(&bt->scond, NULL); 33 | pthread_mutex_init(&bt->smutex, NULL); 34 | return bt; 35 | } 36 | 37 | 38 | /* add a task into task list, must called before the thread run */ 39 | int cdb_bgtask_add(CDBBGTASK *bt, TASKFUNC func, void *arg, int intval) 40 | { 41 | TASK *task = &bt->tasks[bt->tnum]; 42 | 43 | if (bt->tid || bt->tnum > MAXTASKNUM) 44 | return -1; 45 | 46 | task->arg = arg; 47 | task->func = func; 48 | task->intval = intval; 49 | task->ltime = time(NULL); 50 | bt->tnum++; 51 | return 0; 52 | } 53 | 54 | 55 | static void *_cdb_bgtask_func(void *arg) 56 | { 57 | CDBBGTASK *bt = (CDBBGTASK *)arg; 58 | 59 | /* block all signals coming into current thread */ 60 | sigset_t smask; 61 | sigfillset(&smask); 62 | pthread_sigmask(SIG_BLOCK, &smask, NULL); 63 | 64 | /* loop */ 65 | while(bt->run) { 66 | time_t now = time(NULL); 67 | struct timespec timeout; 68 | 69 | /* check should run some tasks every 1 second */ 70 | timeout.tv_sec = now + 1; 71 | timeout.tv_nsec = 0; 72 | 73 | /* iterate and run the tasks */ 74 | for(int i = 0; i < bt->tnum; i++) { 75 | TASK *task = &bt->tasks[i]; 76 | if (now >= task->ltime + task->intval) { 77 | task->func(task->arg); 78 | task->ltime = now; 79 | } 80 | } 81 | pthread_cond_timedwait(&bt->scond, &bt->smutex, &timeout); 82 | } 83 | 84 | return NULL; 85 | } 86 | 87 | 88 | /* create a thread for tasks */ 89 | void cdb_bgtask_start(CDBBGTASK *bt) 90 | { 91 | if (bt->run) 92 | return; 93 | 94 | bt->run = 1; 95 | pthread_create(&bt->tid, NULL, _cdb_bgtask_func, bt); 96 | return; 97 | } 98 | 99 | 100 | /* wait 
for the task thread exits */ 101 | void cdb_bgtask_stop(CDBBGTASK *bt) 102 | { 103 | if (bt->run) { 104 | void **ret = NULL; 105 | bt->run = 0; 106 | pthread_cond_signal(&bt->scond); 107 | pthread_join(bt->tid, ret); 108 | } 109 | 110 | bt->tnum = 0; 111 | } 112 | 113 | 114 | void cdb_bgtask_destroy(CDBBGTASK *bt) 115 | { 116 | cdb_bgtask_stop(bt); 117 | pthread_cond_destroy(&bt->scond); 118 | pthread_mutex_destroy(&bt->smutex); 119 | free(bt); 120 | } 121 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | CuttDB 2 | ======== 3 | A hash-based key-value database for persistent storing massive small records, initially designed to be a indexed repository for ten to hundreds of millions URLs, web pages and small documents. 4 | 5 | 6 | Features 7 | -------- 8 | - It is a database library to be embeded into other programs. 9 | - Keys and values are arbitrary byte arrays, value is retrieved by key. 10 | - Very good performance, especially for insert operation, even when stored massive amount of records. 11 | - Designed sophisticated cache on both index pages and records to take use of memory better. 12 | - Data on disk is always write ahead to minimize the possibility of data loss. 13 | - Bloom filter is included to accelerate the query on inexistent record 14 | - Record expiration and space recycle are supported. 15 | - Multithreading is supported. 16 | - Server side with memcached protocols(set/add/replace/get/delete) is also supported now. 17 | 18 | Limitations 19 | ----------- 20 | - SQL or relation data model is not supported. 21 | - The index structure is based on hash, so prefix query or ordered iteration is not supported. 22 | - Transaction is not supported. 
23 | - Update operation (set on an existing record) is less efficient. 24 | 25 | Performance 26 | ----------- 27 | Simple test on a machine with Core 2 Duo E4500@2.2GHz/8GB RAM/7200RPM SATA/Linux 3.2.0-25 28 | ```sh 29 | Insert 200,000,000 records, key and value are both 8-byte strings: 30 | Overall: 200000000 / 200000000 (458.941 s, 435784 ops) 31 | The program now consumes 2.0 GB of RAM. 32 | 33 | 34 | Clean OS cache by 'echo 3 > /proc/sys/vm/drop_caches'. 35 | Retrieve 50000 records randomly: 36 | Overall: 50000 / 50000 (459.279 s, 108 ops) 37 | 38 | 39 | Retrieve these records again: 40 | Overall: 50000 / 50000 (0.037 s, 1315789 ops) 41 | ``` 42 | 43 | For detailed test results, please refer to [200,000,000 benchmark][benchmark]. 44 | 45 | Usage 46 | ----- 47 | ### compile & use the library 48 | ```sh 49 | $ git clone "http://cuttdb.googlecode.com/svn/trunk/" cuttdb 50 | $ cd cuttdb/src ; make ; sudo make install 51 | $ vim test.c 52 | $ gcc test.c -lcuttdb; mkdir testdb; ./a.out 53 | ``` 54 | Contents of test.c: 55 | ```c 56 | #include <cuttdb.h> 57 | #include <stdio.h> 58 | 59 | int main() 60 | { 61 | CDB *db = cdb_new(); 62 | cdb_option(db, 20000, 16, 16); 63 | if (cdb_open(db, "./testdb/", CDB_CREAT) < 0) { 64 | printf("Open Failed\n"); 65 | return -1; 66 | } 67 | cdb_set(db, "key1", 4, "HELLO1\0", 7); 68 | 69 | char *val; int vsize; 70 | cdb_get(db, "key1", 4, (void**)&val, &vsize); 71 | printf("value for key1:%s[%d]\n", val, vsize); 72 | cdb_free_val((void**)&val); 73 | cdb_destroy(db); 74 | } 75 | ``` 76 | 77 | ### Server version: 78 | ```sh 79 | $ ./cuttdb-server -H /data/testdb -r 0 -P 1024 -n 100000 -d -t 4 80 | $ telnet 127.0.0.1 8964 81 | 82 | 83 | Trying 127.0.0.1... 84 | Connected to 127.0.0.1. 85 | Escape character is '^]'. 
86 | set test_key1 0 0 10 87 | test_value 88 | STORED 89 | get test_key1 90 | VALUE test_key1 0 10 91 | test_value 92 | END 93 | delete test_key1 94 | DELETED 95 | get test_key1 96 | END 97 | ``` 98 | 99 | The program is currently used in web crawler at cutt.com 100 | 101 | [benchmark]:http://cuttdb.googlecode.com/files/benchmark-20121025-200000000-8-8.result.txt 102 | -------------------------------------------------------------------------------- /src/cdb_dumpraw.c: -------------------------------------------------------------------------------- 1 | /* 2 | * CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 8 | * Use and distribution licensed under the BSD license. 9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #define SI4 4 26 | #define SI8 8 27 | 28 | /* data record */ 29 | typedef struct { 30 | /* disk store starts at following field */ 31 | uint32_t magic; 32 | uint32_t ksize; 33 | uint32_t vsize; 34 | uint32_t expire; 35 | uint64_t oid; 36 | char buf[0]; 37 | } __attribute__((packed)) CDBREC; 38 | 39 | /* real size of a record header when stored on disk */ 40 | #define RECHSIZE (SI4 * 4 + SI8) 41 | /* real size of a record when stored on disk */ 42 | #define RECSIZE(r) (RECHSIZE + (r)->ksize + (r)->vsize) 43 | 44 | #define FILEMETASIZE 64 45 | #define ALIGNBYTES 16 46 | #define RECMAGIC 0x19871022 47 | #define DELRECMAGIC 0x19871023 48 | #define FILEMAGICHEADER "CuTtDbFiLePaRtIaL" 49 | #define FILEMAGICLEN (strlen(FILEMAGICHEADER)) 50 | #define OFFALIGNED(off) (((off) & (ALIGNBYTES - 1))? 
((off) | (ALIGNBYTES - 1)) + 1: off) 51 | 52 | 53 | /* dump every live record of one data file to stdout as "key TAB value TAB expire", one per line; 54 | deleted records and unrecognized regions are skipped, problems are reported on stderr */ 55 | void process(const char *filename) 56 | { 57 | #define SBUFSIZE 4096 58 | char buf[SBUFSIZE]; 59 | int fd = open(filename, O_RDONLY, 0644); 60 | if (fd < 0) { 61 | fprintf(stderr, "%s Open failed\n", filename); 62 | return; 63 | } 64 | 65 | long filesize = lseek(fd, 0, SEEK_END); 66 | /* a data file holds at least the meta block; this also catches lseek() returning -1 */ 67 | if (filesize < FILEMETASIZE) { 68 | fprintf(stderr, "%s is not a cuttdb file\n", filename); 69 | close(fd); 70 | return; 71 | } 72 | 73 | char *map = (char*)mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0); 74 | if (map == (char*)MAP_FAILED) { 75 | fprintf(stderr, "%s mmap failed\n", filename); 76 | close(fd); 77 | return; 78 | } 79 | if (memcmp(map, FILEMAGICHEADER, FILEMAGICLEN)) { 80 | fprintf(stderr, "%s is not a cuttdb file\n", filename); 81 | munmap(map, filesize); 82 | close(fd); 83 | return; 84 | } 85 | 86 | long pos = FILEMETASIZE; 87 | /* only inspect positions where a whole record header lies inside the mapping */ 88 | while(pos + RECHSIZE <= filesize) { 89 | char *kvbuf = buf; 90 | CDBREC *rec = (CDBREC*)&map[pos]; 91 | if (rec->magic != RECMAGIC && rec->magic != DELRECMAGIC) { 92 | pos += ALIGNBYTES; 93 | continue; 94 | } 95 | 96 | /* sizes come from the file: add them in 64 bits and require the record to end inside the file */ 97 | uint64_t datasize = (uint64_t)rec->ksize + rec->vsize; 98 | if (datasize > (uint64_t)(filesize - pos - RECHSIZE)) { 99 | pos += ALIGNBYTES; 100 | continue; 101 | } 102 | 103 | uint64_t recsize = RECHSIZE + datasize; 104 | pos += OFFALIGNED(recsize); 105 | if (rec->magic != RECMAGIC) 106 | continue; 107 | 108 | if (datasize + 2 > SBUFSIZE) { 109 | kvbuf = (char*)malloc(datasize + 2); 110 | if (kvbuf == NULL) { 111 | fprintf(stderr, "%s Out of memory\n", filename); 112 | break; 113 | } 114 | } 115 | memcpy(kvbuf, rec->buf, rec->ksize); 116 | kvbuf[rec->ksize] = '\t'; 117 | memcpy(kvbuf + rec->ksize + 1, rec->buf + rec->ksize, rec->vsize); 118 | kvbuf[datasize + 1] = '\0'; 119 | printf("%s\t%u\n", kvbuf, rec->expire); 120 | if (kvbuf != buf) 121 | free(kvbuf); 122 | } 123 | 124 | munmap(map, filesize); 125 | close(fd); 126 | } 127 | 128 | 129 | 130 | 131 | int main(int argc, char *argv[]) 132 | { 133 | if (argc < 2) { 134 | fprintf(stderr, "Usage: %s dat########.cdb dat########.cdb .... \n", argv[0]); 135 | return 0; 136 | } 137 | for(int i = 1; i < argc; i++) 138 | process(argv[i]); 139 | return 0; 140 | } 141 | 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /src/cdb_vio.h: -------------------------------------------------------------------------------- 1 | /* 2 | * CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 
8 | * Use and distribution licensed under the BSD license. 9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | 16 | #ifndef _CDB_VIO_H_ 17 | #define _CDB_VIO_H_ 18 | #include "cdb_types.h" 19 | #include "cuttdb.h" 20 | #include 21 | #include 22 | 23 | enum { 24 | /* obsoleted */ 25 | CDBVIOAPPEND, 26 | /* append only format storage */ 27 | CDBVIOAPND2, 28 | }; 29 | 30 | typedef struct CDBVIO CDBVIO; 31 | 32 | /* write a record, returns virtual offset at 3rd parameter */ 33 | typedef int (*VIOWRITEREC)(CDBVIO*, CDBREC*, FOFF*); 34 | /* delete a record, pass in the current offset at 3rd parameter */ 35 | typedef int (*VIODELETEREC)(CDBVIO*, CDBREC*, FOFF); 36 | /* read a record, 2nd parameter default points to stack buffer, if its real size 37 | greater than the stack buffer size, it will be changed to points to a space in heap, 38 | the last parameter decides whether read the whole record or just read key for comparsion */ 39 | typedef int (*VIOREADREC)(CDBVIO*, CDBREC**, FOFF, bool); 40 | /* close the storage */ 41 | typedef int (*VIOCLOSE)(CDBVIO*); 42 | /* open the storage, pass in the storage path and open mode */ 43 | typedef int (*VIOOPEN)(CDBVIO*, const char*, int); 44 | /* write an index page, return its virtual offset at 3rd parameter */ 45 | typedef int (*VIOWRITEPAGE)(CDBVIO*, CDBPAGE *, FOFF*); 46 | /* read an index page, 2nd parameter default points to stack buffer, if its real size 47 | greater than the stack buffer size, it will be changed to points to a space in heap */ 48 | typedef int (*VIOREADPAGE)(CDBVIO*, CDBPAGE **, FOFF); 49 | /* make the storage do an sync operation */ 50 | typedef int (*VIOSYNC)(CDBVIO*); 51 | /* write db header, which contains main-index */ 52 | typedef int (*VIOWRITEHEAD)(CDBVIO*); 53 | /* read db header, which contains main-index */ 54 | typedef int (*VIOREADHEAD)(CDBVIO*); 55 | /* tell that no dirty page exists */ 56 | typedef void (*VIOCLEANPOINT)(CDBVIO*); 57 | /* get the 
record/page iterator at oid */ 58 | typedef void* (*VIOITFIRST)(CDBVIO *, uint64_t oid); 59 | /* get the next index page by iterator */ 60 | typedef int (*VIOPAGEITNEXT)(CDBVIO *, CDBPAGE **, void *); 61 | /* get the next record by iterator */ 62 | typedef int (*VIORECITNEXT)(CDBVIO *, CDBREC **, void *); 63 | /* destroy and free the iterator */ 64 | typedef void (*VIOITDESTROY)(CDBVIO *, void *); 65 | 66 | struct CDBVIO 67 | { 68 | VIOOPEN open; 69 | VIOCLOSE close; 70 | 71 | VIOWRITEREC wrec; 72 | VIODELETEREC drec; 73 | VIOREADREC rrec; 74 | 75 | VIOWRITEPAGE wpage; 76 | VIOREADPAGE rpage; 77 | 78 | VIOSYNC sync; 79 | VIOWRITEHEAD whead; 80 | VIOREADHEAD rhead; 81 | 82 | VIOCLEANPOINT cleanpoint; 83 | 84 | VIOITFIRST pageitfirst; 85 | VIOPAGEITNEXT pageitnext; 86 | VIOITDESTROY pageitdestroy; 87 | 88 | VIOITFIRST recitfirst; 89 | VIORECITNEXT recitnext; 90 | VIOITDESTROY recitdestroy; 91 | 92 | CDB *db; 93 | void *iometa; 94 | }; 95 | 96 | 97 | CDBVIO *cdb_vio_new(int type); 98 | int cdb_vio_destroy(CDBVIO *vio); 99 | 100 | 101 | #endif 102 | -------------------------------------------------------------------------------- /src/ae_epoll.c: -------------------------------------------------------------------------------- 1 | /* Linux epoll(2) based ae.c module 2 | * Copyright (C) 2009-2010 Salvatore Sanfilippo - antirez@gmail.com 3 | * Released under the BSD license. See the COPYING file for more info. 
*/ 4 | 5 | #include 6 | #include 7 | 8 | typedef struct aeApiState { 9 | int epfd; 10 | struct epoll_event events[AE_SETSIZE]; 11 | } aeApiState; 12 | 13 | static int aeApiCreate(EventLoop *eventLoop) { 14 | aeApiState *state = malloc(sizeof(aeApiState)); 15 | 16 | if (!state) return -1; 17 | state->epfd = epoll_create(1024); /* 1024 is just an hint for the kernel */ 18 | if (state->epfd == -1) return -1; 19 | eventLoop->apidata = state; 20 | return 0; 21 | } 22 | 23 | /* 24 | be not referenced anywhere 25 | static void aeApiFree(EventLoop *eventLoop) { 26 | aeApiState *state = eventLoop->apidata; 27 | 28 | close(state->epfd); 29 | free(state); 30 | } 31 | */ 32 | 33 | static int aeApiAddEvent(EventLoop *eventLoop, int fd, int mask) { 34 | aeApiState *state = eventLoop->apidata; 35 | struct epoll_event ee; 36 | ee.events = EPOLLONESHOT; 37 | if (mask & AE_READABLE) ee.events |= EPOLLIN; 38 | if (mask & AE_WRITABLE) ee.events |= EPOLLOUT; 39 | ee.data.u64 = 0; /* avoid valgrind warning */ 40 | ee.data.fd = fd; 41 | if (epoll_ctl(state->epfd, EPOLL_CTL_ADD,fd,&ee) == -1 && errno != EEXIST) { 42 | fprintf(stderr, "epoll_ctl(%d,%d) failed: %d\n", EPOLL_CTL_ADD,fd,errno); 43 | return -1; 44 | } 45 | return 0; 46 | } 47 | 48 | static int aeApiUpdateEvent(EventLoop *eventLoop, int fd, int mask) { 49 | aeApiState *state = eventLoop->apidata; 50 | struct epoll_event ee; 51 | ee.events = EPOLLONESHOT; 52 | if (mask & AE_READABLE) ee.events |= EPOLLIN; 53 | if (mask & AE_WRITABLE) ee.events |= EPOLLOUT; 54 | ee.data.u64 = 0; /* avoid valgrind warning */ 55 | ee.data.fd = fd; 56 | if (epoll_ctl(state->epfd, EPOLL_CTL_MOD,fd,&ee) == -1) { 57 | fprintf(stderr, "epoll_ctl(%d,%d) failed: %d\n", EPOLL_CTL_ADD,fd,errno); 58 | return -1; 59 | } 60 | return 0; 61 | } 62 | 63 | static int aeApiDelEvent(EventLoop *eventLoop, int fd) { 64 | aeApiState *state = eventLoop->apidata; 65 | struct epoll_event ee; 66 | 67 | ee.events = 0; 68 | ee.data.u64 = 0; /* avoid valgrind warning */ 69 | 
ee.data.fd = fd; 70 | /* Note, Kernel < 2.6.9 requires a non null event pointer even for 71 | * EPOLL_CTL_DEL. */ 72 | if ( epoll_ctl(state->epfd,EPOLL_CTL_DEL,fd,&ee) == -1 73 | && errno != ENOENT && errno != EBADF) { 74 | fprintf(stderr, "epoll_ctl(%d,%d) failed: %d\n", EPOLL_CTL_DEL,fd,errno); 75 | return -1; 76 | } 77 | return 0; 78 | } 79 | 80 | int aeApiPoll(EventLoop *eventLoop, struct timeval *tvp) { 81 | aeApiState *state = eventLoop->apidata; 82 | int retval, numevents = 0; 83 | 84 | retval = epoll_wait(state->epfd,state->events,AE_SETSIZE, 85 | tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1); 86 | if (retval > 0) { 87 | int j; 88 | 89 | numevents = retval; 90 | for (j = 0; j < numevents; j++) { 91 | int mask = 0; 92 | struct epoll_event *e = state->events+j; 93 | 94 | if (e->events & EPOLLIN) mask |= AE_READABLE; 95 | if (e->events & EPOLLOUT) mask |= AE_WRITABLE; 96 | eventLoop->fired[j] = e->data.fd; 97 | } 98 | } 99 | return numevents; 100 | } 101 | 102 | 103 | /* 104 | be not referenced anywhere 105 | static char *aeApiName(void) { 106 | return "epoll"; 107 | } 108 | */ 109 | 110 | -------------------------------------------------------------------------------- /src/cdb_core.h: -------------------------------------------------------------------------------- 1 | /* 2 | * CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 8 | * Use and distribution licensed under the BSD license. 
9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | 16 | #ifndef _CDB_CORE_H_ 17 | #define _CDB_CORE_H_ 18 | #include "cuttdb.h" 19 | #include "cdb_types.h" 20 | #include "cdb_hashtable.h" 21 | #include "cdb_bloomfilter.h" 22 | #include "cdb_lock.h" 23 | #include "cdb_vio.h" 24 | #include "cdb_bgtask.h" 25 | #include 26 | #include 27 | 28 | 29 | enum { 30 | CDB_PAGEDELETEOFF = 0, 31 | CDB_PAGEINSERTOFF = 1, 32 | }; 33 | 34 | /* the DB object */ 35 | struct CDB 36 | { 37 | /* size limit for record cache */ 38 | uint64_t rclimit; 39 | /* size limit for index page cache */ 40 | uint64_t pclimit; 41 | /* size of bloom filter */ 42 | uint64_t bfsize; 43 | /* record number in db */ 44 | uint64_t rnum; 45 | /* always increment operation id */ 46 | uint64_t oid; 47 | /* recovery point oid */ 48 | uint64_t roid; 49 | /* hash table size */ 50 | uint32_t hsize; 51 | /* last timestamp of no dirty page state */ 52 | uint32_t ndpltime; 53 | /* currently the database opened or not */ 54 | bool opened; 55 | /* the size for a disk seek&read, should not greater than SBUFSIZE */ 56 | uint32_t areadsize; 57 | 58 | /* record cache */ 59 | CDBHASHTABLE *rcache; 60 | /* (clean) index page cache */ 61 | CDBHASHTABLE *pcache; 62 | /* dirty index page cache */ 63 | CDBHASHTABLE *dpcache; 64 | /* Bloom Filter */ 65 | CDBBLOOMFILTER *bf; 66 | 67 | /* lock for rcache */ 68 | CDBLOCK *rclock; 69 | /* lock for pcache */ 70 | CDBLOCK *pclock; 71 | /* lock for dpcache */ 72 | CDBLOCK *dpclock; 73 | /* lock for hash table operation, split to MLOCKNUM groups */ 74 | CDBLOCK *mlock[MLOCKNUM]; 75 | /* lock for statistic */ 76 | CDBLOCK *stlock; 77 | /* lock for operation id */ 78 | CDBLOCK *oidlock; 79 | /* lock for bloom filter */ 80 | CDBLOCK *bflock; 81 | /* background tasks in another thread */ 82 | CDBBGTASK *bgtask; 83 | 84 | /* main hash table, contains 'hsize' elements */ 85 | FOFF *mtable; 86 | /* disk i/o layer object */ 87 | CDBVIO *vio; 88 | 89 
| /* callback function when error occurs */ 90 | CDB_ERRCALLBACK errcb; 91 | /* argument for callback function */ 92 | void *errcbarg; 93 | /* key to get error code in current thread */ 94 | void *errkey; 95 | 96 | /* statistics below, this fields have no lock protection */ 97 | /* record cache hit/miss */ 98 | uint64_t rchit; 99 | uint64_t rcmiss; 100 | /* page cache hit/miss */ 101 | uint64_t pchit; 102 | uint64_t pcmiss; 103 | /* cumulative disk read time */ 104 | uint64_t rtime; 105 | /* number of disk read operation */ 106 | uint64_t rcount; 107 | /* cumulative disk write time */ 108 | uint64_t wtime; 109 | /* number of disk write operation */ 110 | uint64_t wcount; 111 | }; 112 | 113 | 114 | bool cdb_checkoff(CDB *db, uint64_t hash, FOFF off, int locked); 115 | int cdb_getoff(CDB *db, uint64_t hash, FOFF **offs, int locked); 116 | int cdb_replaceoff(CDB *db, uint64_t hash, FOFF off, FOFF noff, int locked); 117 | int cdb_updatepage(CDB *db, uint64_t hash, FOFF off, int opt, int locked); 118 | void cdb_flushalldpage(CDB *db); 119 | uint64_t cdb_genoid(CDB *db); 120 | 121 | #endif 122 | 123 | -------------------------------------------------------------------------------- /src/cdb_types.h: -------------------------------------------------------------------------------- 1 | /* 2 | * CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 8 | * Use and distribution licensed under the BSD license. 
9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | 16 | #ifndef _CDB_TYPES_H_ 17 | #define _CDB_TYPES_H_ 18 | #include 19 | 20 | #define KB 1024 21 | #define MB 1048576 22 | #define CDBMIN(a, b) ((a)<(b)?(a):(b)) 23 | #define CDBMAX(a, b) ((a)>(b)?(a):(b)) 24 | 25 | #define SI8 8 26 | #define SI4 4 27 | /* space reserved in stack for i/o, avoid some malloc/free */ 28 | #define SBUFSIZE (64 * KB) 29 | 30 | /* a default disk read size for index page, 3KB is enough(a page with 300 items) */ 31 | #define PAGEAREADSIZE (3 * KB) 32 | 33 | /* reserved in stack for matched items in a hash index page */ 34 | #define SFOFFNUM 8 35 | 36 | /* a valid virtual offset */ 37 | #define OFFNOTNULL(o) (((o).i4)||((o).i2)) 38 | /* a null virtual offset */ 39 | #define OFFNULL(o) (((o).i4==0)&&((o).i2==0)) 40 | /* nullify an offset */ 41 | #define OFFZERO(o) do{(o).i4=0;(o).i2=0;}while(0) 42 | /* offset is equal ? */ 43 | #define OFFEQ(a,b) (((a).i4==(b).i4)&&((a).i2==(b).i2)) 44 | /* hash in page is equal ? 
*/ 45 | #define PHASHEQ(a,b) (((a).i2==(b).i2)&&((a).i1==(b).i1)) 46 | /* page size increment */ 47 | #define CDB_PAGEINCR 4 48 | 49 | 50 | /* if page cache size exceeds the limit */ 51 | #define PCOVERFLOW(db) ((db)->dpcache && (db)->dpcache->size + (db)->pcache->size > (db)->pclimit) 52 | /* if record cache size exceeds the limit */ 53 | #define RCOVERFLOW(db) ((db)->rcache && (db)->rcache->size > (db)->rclimit) 54 | 55 | /* timeout for a dirty index page stays since last modify */ 56 | #define DPAGETIMEOUT 40 57 | /* operation on main table are isolated by these locks */ 58 | #define MLOCKNUM 256 59 | 60 | #define CDBHASH64(a, b) cdb_crc64(a, b) 61 | 62 | /* all virtual offsets are 48-bits */ 63 | typedef struct FOFF 64 | { 65 | uint32_t i4; 66 | uint16_t i2; 67 | } __attribute__((packed)) FOFF; 68 | 69 | 70 | 71 | #define SFOFF (sizeof(FOFF)) 72 | 73 | 74 | /* all hash value in index page are 24-bits 75 | range 0..16M guarantee very low collision 76 | with less than a hundred records in a page */ 77 | typedef struct PHASH 78 | { 79 | uint16_t i2; 80 | uint8_t i1; 81 | } __attribute__((packed)) PHASH; 82 | 83 | 84 | /* an item in index page contains a hash and an offset */ 85 | typedef struct PITEM 86 | { 87 | FOFF off; 88 | PHASH hash; 89 | } __attribute__((packed)) PITEM; 90 | 91 | 92 | /* data record */ 93 | typedef struct CDBREC{ 94 | /* where the data come from */ 95 | FOFF ooff; 96 | uint32_t osize; 97 | 98 | /* access convenient*/ 99 | void *key; 100 | void *val; 101 | 102 | /* disk store starts at following field */ 103 | uint32_t magic; 104 | uint32_t ksize; 105 | uint32_t vsize; 106 | uint32_t expire; 107 | uint64_t oid; 108 | char buf[0]; 109 | } __attribute__((packed)) CDBREC; 110 | 111 | /* real size of a record header when stored on disk */ 112 | #define RECHSIZE (SI4 * 4 + SI8) 113 | /* real size of a record when stored on disk */ 114 | #define RECSIZE(r) (RECHSIZE + (r)->ksize + (r)->vsize) 115 | 116 | 117 | /* index page */ 118 | typedef struct 
CDBPAGE{ 119 | FOFF ooff; 120 | uint32_t osize; 121 | uint32_t cap; 122 | 123 | union { 124 | /* what it be on disk */ 125 | uint32_t magic; 126 | /* what it be in memory */ 127 | uint32_t mtime; 128 | }; 129 | /* which bucket it belongs to */ 130 | uint32_t bid; 131 | uint32_t num; 132 | uint64_t oid; 133 | PITEM items[0]; 134 | } __attribute__((packed)) CDBPAGE; 135 | 136 | /* real size of a page header when stored on disk */ 137 | #define PAGEHSIZE (SI4 * 3 + SI8) 138 | /* real size of a page when stored on disk */ 139 | #define PAGESIZE(p) (PAGEHSIZE + sizeof(PITEM) * (p)->num) 140 | /* in-memory size of an record structure */ 141 | #define MPAGESIZE(p) (sizeof(CDBPAGE) + sizeof(PITEM) * (p)->cap) 142 | 143 | #endif 144 | 145 | -------------------------------------------------------------------------------- /src/test_mt.c: -------------------------------------------------------------------------------- 1 | /* 2 | * CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 8 | * Use and distribution licensed under the BSD license. 
9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include "cuttdb.h" 22 | 23 | 24 | CDB *db; 25 | 26 | enum { 27 | SETOP, 28 | GETOP, 29 | DELOP, 30 | }; 31 | 32 | #if 1 33 | static int prob_table1[8] = {SETOP, SETOP, SETOP, SETOP, SETOP, SETOP, DELOP, GETOP}; 34 | static int prob_table2[8] = {SETOP, SETOP, SETOP, SETOP, SETOP, DELOP, DELOP, GETOP}; 35 | static int prob_table3[8] = {SETOP, SETOP, SETOP, DELOP, DELOP, DELOP, DELOP, GETOP}; 36 | #else 37 | static int prob_table1[8] = {SETOP, SETOP, SETOP, SETOP, SETOP, SETOP, SETOP, GETOP}; 38 | static int prob_table2[8] = {SETOP, SETOP, SETOP, SETOP, SETOP, SETOP, SETOP, GETOP}; 39 | static int prob_table3[8] = {SETOP, SETOP, SETOP, SETOP, SETOP, SETOP, SETOP, GETOP}; 40 | #endif 41 | int *optable = NULL; 42 | 43 | 44 | long get_rand() 45 | { 46 | return (long)rand() * RAND_MAX + rand(); 47 | } 48 | 49 | 50 | void *test_thread(void *arg) 51 | { 52 | char key[64]; 53 | char value[128]; 54 | void *v; 55 | int knum = *(int*)arg; 56 | while(1) { 57 | int krand = get_rand() % knum; 58 | int ksize = snprintf(key, 64, "%ld%ld%ld", krand, krand, krand); 59 | int vsize = snprintf(value, 128, "%ld%ld%ld%ld%d%ld%ld%ld%ld", 60 | krand, krand, krand, krand, krand, krand, krand, krand); 61 | int op = optable[rand() & 0x07]; 62 | int expire = 600 + 20 * (rand() % 1000); 63 | switch(op) { 64 | case SETOP: 65 | if (cdb_set2(db, key, ksize, value, vsize, CDB_OVERWRITE | CDB_INSERTCACHE, expire) < 0) 66 | printf("ERROR! %s:%d\n", __FILE__, __LINE__); 67 | break; 68 | case GETOP: 69 | if (cdb_get(db, key, ksize, &v, &vsize) == -1) 70 | printf("ERROR! %s:%d\n", __FILE__, __LINE__); 71 | if (v) 72 | cdb_free_val(&v); 73 | break; 74 | case DELOP: 75 | if (cdb_del(db, key, ksize) == -1) 76 | printf("ERROR! 
%s:%d\n", __FILE__, __LINE__); 77 | break; 78 | default: 79 | break; 80 | } 81 | } 82 | } 83 | 84 | 85 | 86 | int main(int argc, char *argv[]) 87 | { 88 | int thread_num = 2; 89 | int record_num = 10000000; 90 | char *db_path = NULL; 91 | printf("Usage: %s db_path [record_num] [thread_num]\n", argv[0]); 92 | if (argc >= 2) 93 | db_path = argv[1]; 94 | else 95 | return -1; 96 | 97 | if (argc >= 3) 98 | record_num = atoi(argv[2]); 99 | if (argc >= 4) 100 | thread_num = atoi(argv[3]); 101 | 102 | record_num = record_num < 100? 100: record_num; 103 | thread_num = thread_num < 1? 1: thread_num; 104 | srand(time(NULL)); 105 | 106 | db = cdb_new(); 107 | cdb_option(db, record_num / 100, 0, 1024000); 108 | if (cdb_open(db, db_path, CDB_CREAT | CDB_TRUNC) < 0) { 109 | printf("DB Open err\n"); 110 | return -1; 111 | } 112 | 113 | 114 | optable = prob_table1; 115 | pthread_t threads[thread_num]; 116 | for(int i = 0; i < thread_num; i++) { 117 | pthread_create(&threads[i], NULL, test_thread, &record_num); 118 | } 119 | 120 | int clear_interval = 0; 121 | while(1) { 122 | CDBSTAT st; 123 | cdb_stat(db, &st); 124 | printf("rnum: %lu, rcnum: %lu, pnum: %lu, pcnum %lu, rlatcy: %u wlatcy: %u" 125 | " rh/m: %lu/%lu ph/m: %lu/%lu\n", 126 | st.rnum, st.rcnum, st.pnum, st.pcnum, st.rlatcy, st.wlatcy, 127 | st.rchit, st.rcmiss, st.pchit, st.pcmiss); 128 | if (++clear_interval % 20 == 0) 129 | cdb_stat(db, NULL); 130 | 131 | if (st.rnum > 0.7 * record_num) 132 | optable = prob_table2; 133 | if (st.rnum > 0.9 * record_num) 134 | optable = prob_table3; 135 | 136 | if (st.rnum < 0.8 * record_num) 137 | optable = prob_table2; 138 | 139 | if (st.rnum < 0.6 * record_num) 140 | optable = prob_table1; 141 | fflush(stdout); 142 | sleep(1); 143 | } 144 | 145 | return 0; 146 | } 147 | 148 | 149 | 150 | -------------------------------------------------------------------------------- /src/cdb_bloomfilter.c: -------------------------------------------------------------------------------- 1 | /* 2 | * 
CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 8 | * Use and distribution licensed under the BSD license. 9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | 16 | #include "cdb_bloomfilter.h" 17 | #include 18 | #include 19 | 20 | #define CDBBFHASHNUM 16 21 | #define CDBBFSPLITPOW 6 22 | 23 | static uint64_t BFSEEDS[CDBBFHASHNUM] = {217636919,290182597,386910137,515880193, 24 | 687840301,917120411,1222827239,1610612741, 25 | 3300450239,3300450259,3300450281,3300450289, 26 | 3221225473ul,4294967291ul,163227661,122420729,}; 27 | 28 | struct CDBBLOOMFILTER 29 | { 30 | uint8_t *bitmap[1<rnum = 0; 42 | bf->size = size; 43 | /* number of hash should be 0.7 * ratio */ 44 | bf->hnum = size * 8 * 7 / (rnum * 10); 45 | /* number of hash is limit in [1, 16] */ 46 | if (bf->hnum > CDBBFHASHNUM) 47 | bf->hnum = CDBBFHASHNUM; 48 | if (bf->hnum == 0) 49 | bf->hnum = 1; 50 | /* avoid malloc too much memory once */ 51 | for(int i = 0; i < (1 << CDBBFSPLITPOW); i++) { 52 | bf->bitmap[i] = (uint8_t*)malloc(size >> CDBBFSPLITPOW); 53 | memset(bf->bitmap[i], 0, size >> CDBBFSPLITPOW); 54 | } 55 | return bf; 56 | } 57 | 58 | 59 | void cdb_bf_set(CDBBLOOMFILTER *bf, void *key, int ksize) 60 | { 61 | uint8_t *src = (uint8_t *)key, *end = src + ksize; 62 | uint64_t hval[CDBBFHASHNUM] = {0}; 63 | 64 | for(;src < end; src++) 65 | for(int i = 0; i < bf->hnum; i++) 66 | hval[i] = hval[i] * BFSEEDS[i] + *src; 67 | 68 | for(int i = 0; i < bf->hnum; i++) { 69 | uint64_t p = (hval[i] >> CDBBFSPLITPOW) % ((bf->size >> CDBBFSPLITPOW) << 3); 70 | uint8_t *bitmap = bf->bitmap[hval[i] & ((1<> 3] |= (1 << (p & 0x07)); 72 | } 73 | 74 | bf->rnum++; 75 | } 76 | 77 | 78 | bool cdb_bf_exist(CDBBLOOMFILTER *bf, void *key, int ksize) 79 | { 80 | uint8_t *src = (uint8_t *)key, *end = src + ksize; 81 | uint64_t hval[CDBBFHASHNUM] = {0}; 82 | int exist = 0; 83 | 
84 | for(;src < end; src++) 85 | for(int i = 0; i < bf->hnum; i++) 86 | hval[i] = hval[i] * BFSEEDS[i] + *src; 87 | 88 | for(int i = 0; i < bf->hnum; i++) { 89 | uint64_t p = (hval[i] >> CDBBFSPLITPOW) % ((bf->size >> CDBBFSPLITPOW) << 3); 90 | uint8_t *bitmap = bf->bitmap[hval[i] & ((1<> 3] & (1 << (p & 0x07))) 92 | exist++; 93 | else 94 | break; 95 | } 96 | 97 | return (exist == bf->hnum); 98 | } 99 | 100 | void cdb_bf_clean(CDBBLOOMFILTER *bf) 101 | { 102 | for(int i = 0; i < (1 << CDBBFSPLITPOW); i++) 103 | memset(bf->bitmap[i], 0, bf->size >> CDBBFSPLITPOW); 104 | 105 | bf->rnum = 0; 106 | } 107 | 108 | 109 | void cdb_bf_destroy(CDBBLOOMFILTER *bf) 110 | { 111 | for(int i = 0; i < (1 << CDBBFSPLITPOW); i++) 112 | free(bf->bitmap[i]); 113 | free(bf); 114 | } 115 | 116 | 117 | #ifdef _UT_CDBBF_ 118 | #include 119 | #include 120 | #include "cdb_bloomfilter.h" 121 | 122 | int main(int argc, char *argv[]) 123 | { 124 | int size = 1048576; 125 | int rnum = 1048576; 126 | if (argc > 1) 127 | rnum = atoi(argv[1]); 128 | if (argc > 2) 129 | size = atoi(argv[2]); 130 | 131 | CDBBLOOMFILTER *bf = cdb_bf_new(rnum, size); 132 | for(int i = 0; i < rnum; i++) { 133 | int j = 2 * i; 134 | cdb_bf_set(bf, &j, 4); 135 | } 136 | 137 | int exist = 0; 138 | for(int i = 0; i < rnum; i++) { 139 | int j = 2 * i; 140 | if (cdb_bf_exist(bf, &j, 4)) 141 | exist++; 142 | } 143 | printf("right positive: %.2f%%%%\n", (float)exist/(float)rnum*10000); 144 | 145 | exist = 0; 146 | for(int i = 0; i < rnum * 2; i++) { 147 | int j = 2 * i + 1; 148 | if (cdb_bf_exist(bf, &j, 4)) 149 | exist++; 150 | } 151 | 152 | printf("false positive: %.2f%%%% %d/%d\n", (float)exist/(float)rnum*5000, exist, rnum * 2); 153 | printf("element num: %d\n", bf->rnum); 154 | cdb_bf_destroy(bf); 155 | return 0; 156 | } 157 | #endif 158 | 159 | -------------------------------------------------------------------------------- /src/cdb_hashtable.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 8 | * Use and distribution licensed under the BSD license. 9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | 16 | #ifndef _CDB_HASHTABLE_H_ 17 | #define _CDB_HASHTABLE_H_ 18 | #include 19 | #include 20 | #include 21 | 22 | #if defined(__cplusplus) 23 | extern "C" { 24 | #endif 25 | 26 | typedef uint32_t (*CDBHASHFUNC)(const void *, int); 27 | 28 | /* default 1<<8 level-1 buckets, which makes the table expanding more smoothly */ 29 | #define CDBHTBNUMPOW 8 30 | 31 | 32 | typedef struct CDBHTITEM 33 | { 34 | int ksize; 35 | int vsize; 36 | uint32_t hash; 37 | /* next element with the same hash */ 38 | struct CDBHTITEM *hnext; 39 | /* if LRU is true, the first several bytes are two pointers of prev/next element */ 40 | struct CDBHTITEM *lruptr[0]; 41 | char buf[0]; 42 | } __attribute__((packed)) CDBHTITEM; 43 | 44 | 45 | typedef struct { 46 | /* array for items */ 47 | CDBHTITEM **items; 48 | /* number of allocated slots in the bucket */ 49 | uint32_t bnum; 50 | /* number of items exist in the bucket */ 51 | uint32_t rnum; 52 | } CDBHTBUCKET; 53 | 54 | 55 | typedef struct CDBHASHTABLE { 56 | /* is in LRU mode? 
*/ 57 | bool lru; 58 | /* user specified hash function */ 59 | CDBHASHFUNC hash; 60 | /* fixed number for level-1 buckets */ 61 | CDBHTBUCKET buckets[1<buf + ht->lru * 2 * sizeof(void*)) */ 75 | void *cdb_ht_itemkey(CDBHASHTABLE *ht, CDBHTITEM *item); 76 | 77 | /* get the pointer of value in current item */ 78 | /* #define cdb_ht_itemval(ht, item) (item->buf + ht->lru * 2 * sizeof(void*) + item->ksize) */ 79 | void *cdb_ht_itemval(CDBHASHTABLE *ht, CDBHTITEM *item); 80 | 81 | /* create an hashtable, it can be a simple hashtable or with LeastRecentUse 82 | The LRU mode needs extra two pointer space for every element 83 | hash function can by specified by user */ 84 | CDBHASHTABLE *cdb_ht_new(bool lru, CDBHASHFUNC hashfunc); 85 | 86 | /* clean and free the hastable */ 87 | void cdb_ht_destroy(CDBHASHTABLE *ht); 88 | 89 | /* allocate a new item with specified size, but do not insert it into table */ 90 | CDBHTITEM *cdb_ht_newitem(CDBHASHTABLE *ht, int ksize, int vsize); 91 | 92 | /* insert an item which already exists into table */ 93 | void cdb_ht_insert(CDBHASHTABLE *ht, CDBHTITEM *item); 94 | 95 | /* allocate and insert an item into table by key and value, return the pointer of value in table */ 96 | void *cdb_ht_insert2(CDBHASHTABLE *ht, const void *key, int ksize, const void *val, int vsize); 97 | 98 | /* get the value of an item and its size in table, move the item to front if mtf == true */ 99 | void *cdb_ht_get(CDBHASHTABLE *ht, const void *key, int ksize, int *vsize, bool mtf); 100 | 101 | /* get the value of an item, assume the size is known, move the item to front if mtf == true */ 102 | void *cdb_ht_get2(CDBHASHTABLE *ht, const void *key, int ksize, bool mtf); 103 | 104 | /* get the pointer of an item, it hasn't been copied */ 105 | CDBHTITEM *cdb_ht_get3(CDBHASHTABLE *ht, const void *key, int ksize, bool mtf); 106 | 107 | /* check if an item with the key exists */ 108 | bool cdb_ht_exist(CDBHASHTABLE *ht, const void *key, int ksize); 109 | 110 | /* delete 
and free an item from table by its key */ 111 | int cdb_ht_del2(CDBHASHTABLE *ht, const void *key, int ksize); 112 | 113 | /* return and delete an item from table, the item should be freed by user */ 114 | CDBHTITEM *cdb_ht_del(CDBHASHTABLE *ht, const void *key, int ksize); 115 | 116 | /* delete and free the last item in table */ 117 | void cdb_ht_removetail(CDBHASHTABLE *ht); 118 | 119 | /* return last item in table, do not delete nor free */ 120 | CDBHTITEM *cdb_ht_gettail(CDBHASHTABLE *ht); 121 | 122 | /* return last item in table, delete but should be freed by user */ 123 | CDBHTITEM *cdb_ht_poptail(CDBHASHTABLE *ht); 124 | 125 | /* clean and free all elements in the table*/ 126 | void cdb_ht_clean(CDBHASHTABLE *ht); 127 | 128 | /* iterate the table by get the front one firstly */ 129 | CDBHTITEM *cdb_ht_iterbegin(CDBHASHTABLE *ht); 130 | 131 | /* get the next item of current element */ 132 | CDBHTITEM *cdb_ht_iternext(CDBHASHTABLE *ht, CDBHTITEM *cur); 133 | 134 | #if defined(__cplusplus) 135 | } 136 | #endif 137 | 138 | #endif 139 | 140 | -------------------------------------------------------------------------------- /src/server-thread.c: -------------------------------------------------------------------------------- 1 | /* 2 | * CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * The server&network part of CuttDB is based on Beansdb: 8 | * 9 | * http://beansdb.googlecode.com 10 | * 11 | * Beansdb is most based on Memcachedb and Memcached: 12 | * 13 | * http://memcachedb.org/ 14 | * http://danga.com/memcached/ 15 | * 16 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 17 | * Use and distribution licensed under the BSD license. 
18 | * See the LICENSE file for full text 19 | * 20 | * Author: Siyuan Fu 21 | * 22 | */ 23 | 24 | #include "cuttdb-server.h" 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | 33 | #ifdef HAVE_MALLOC_H 34 | #include 35 | #endif 36 | 37 | #ifdef HAVE_STRING_H 38 | #include 39 | #endif 40 | 41 | #include 42 | 43 | typedef struct EventLoop { 44 | // int maxfd; 45 | conn* conns[AE_SETSIZE]; 46 | int fired[AE_SETSIZE]; 47 | int nready; 48 | void *apidata; 49 | } EventLoop; 50 | 51 | /* Lock for connection freelist */ 52 | static pthread_mutex_t conn_lock; 53 | 54 | /* Lock for item buffer freelist */ 55 | static pthread_mutex_t ibuffer_lock; 56 | 57 | static EventLoop loop; 58 | static pthread_mutex_t leader; 59 | 60 | /* 61 | * Pulls a conn structure from the freelist, if one is available. 62 | */ 63 | conn *mt_conn_from_freelist() { 64 | conn *c; 65 | pthread_mutex_lock(&conn_lock); 66 | c = do_conn_from_freelist(); 67 | pthread_mutex_unlock(&conn_lock); 68 | return c; 69 | } 70 | 71 | /* 72 | * Adds a conn structure to the freelist. 73 | * 74 | * Returns 0 on success, 1 if the structure couldn't be added. 75 | */ 76 | bool mt_conn_add_to_freelist(conn *c) { 77 | bool result; 78 | 79 | pthread_mutex_lock(&conn_lock); 80 | result = do_conn_add_to_freelist(c); 81 | pthread_mutex_unlock(&conn_lock); 82 | 83 | return result; 84 | } 85 | 86 | 87 | /******************************* GLOBAL STATS ******************************/ 88 | 89 | void mt_stats_lock() { 90 | } 91 | 92 | void mt_stats_unlock() { 93 | } 94 | 95 | /* Include the best multiplexing layer supported by this system. 96 | * The following should be ordered by performances, descending. */ 97 | #ifdef HAVE_EPOLL 98 | #include "ae_epoll.c" 99 | #else 100 | #ifdef HAVE_KQUEUE 101 | #include "ae_kqueue.c" 102 | #else 103 | #include "ae_select.c" 104 | #endif 105 | #endif 106 | 107 | /* 108 | * Initializes the thread subsystem, creating various worker threads. 
109 | * 110 | * nthreads Number of event handler threads to spawn 111 | */ 112 | void thread_init(int nthreads) { 113 | pthread_mutex_init(&ibuffer_lock, NULL); 114 | pthread_mutex_init(&conn_lock, NULL); 115 | pthread_mutex_init(&leader, NULL); 116 | 117 | memset(&loop, 0, sizeof(loop)); 118 | if (aeApiCreate(&loop) == -1) { 119 | exit(1); 120 | } 121 | } 122 | 123 | int add_event(int fd, int mask, conn *c) 124 | { 125 | if (fd >= AE_SETSIZE) { 126 | fprintf(stderr, "fd is too large: %d\n", fd); 127 | return AE_ERR; 128 | } 129 | assert(loop.conns[fd] == NULL); 130 | loop.conns[fd] = c; 131 | if (aeApiAddEvent(&loop, fd, mask) == -1){ 132 | loop.conns[fd] = NULL; 133 | return AE_ERR; 134 | } 135 | // if (fd > loop.maxfd) 136 | // loop.maxfd = fd; 137 | return AE_OK; 138 | } 139 | 140 | int update_event(int fd, int mask, conn *c) 141 | { 142 | loop.conns[fd] = c; 143 | if (aeApiUpdateEvent(&loop, fd, mask) == -1){ 144 | loop.conns[fd] = NULL; 145 | return AE_ERR; 146 | } 147 | return AE_OK; 148 | } 149 | 150 | int delete_event(int fd) 151 | { 152 | if (fd >= AE_SETSIZE) return -1; 153 | if (loop.conns[fd] == NULL) return 0; 154 | if (aeApiDelEvent(&loop, fd) == -1) 155 | return -1; 156 | loop.conns[fd] = NULL; 157 | return 0; 158 | } 159 | 160 | static void *worker_main(void *arg) { 161 | pthread_setcanceltype (PTHREAD_CANCEL_ASYNCHRONOUS, 0); 162 | 163 | struct timeval tv = {1, 0}; 164 | while (!daemon_quit) { 165 | pthread_mutex_lock(&leader); 166 | 167 | AGAIN: 168 | while(loop.nready == 0 && daemon_quit == 0) 169 | loop.nready = aeApiPoll(&loop, &tv); 170 | if (daemon_quit) { 171 | pthread_mutex_unlock(&leader); 172 | break; 173 | } 174 | 175 | loop.nready --; 176 | int fd = loop.fired[loop.nready]; 177 | conn *c = loop.conns[fd]; 178 | if (c == NULL){ 179 | fprintf(stderr, "Bug: conn %d should not be NULL\n", fd); 180 | close(fd); 181 | goto AGAIN; 182 | } 183 | loop.conns[fd] = NULL; 184 | pthread_mutex_unlock(&leader); 185 | 186 | drive_machine(c); 187 | if 
(c->ev_flags > 0) { 188 | update_event(fd, c->ev_flags, c); 189 | } 190 | } 191 | return NULL; 192 | } 193 | 194 | void loop_run(int nthread) 195 | { 196 | int i, ret; 197 | pthread_attr_t attr; 198 | pthread_attr_init(&attr); 199 | pthread_t* tids = malloc(sizeof(pthread_t) * nthread); 200 | 201 | for (i=0; i 12 | * 13 | */ 14 | 15 | 16 | #ifndef _CUTTDB_H_ 17 | #define _CUTTDB_H_ 18 | #include 19 | #include 20 | 21 | #if defined(__cplusplus) 22 | extern "C" { 23 | #endif 24 | 25 | typedef struct CDB CDB; 26 | typedef void (*CDB_ERRCALLBACK)(void *, int, const char *, int); 27 | typedef bool (*CDB_ITERCALLBACK)(void *, const char *, int, const char *, int, uint32_t, uint64_t); 28 | 29 | /* performance statistics of a database instance */ 30 | typedef struct { 31 | /* number of records in db */ 32 | uint64_t rnum; 33 | /* number of records in cache */ 34 | uint64_t rcnum; 35 | /* number of pages in db */ 36 | uint64_t pnum; 37 | /* number of pages in cache */ 38 | uint64_t pcnum; 39 | /* cache hits of the record cache */ 40 | uint64_t rchit; 41 | /* cache misses of the record cache */ 42 | uint64_t rcmiss; 43 | /* cache hits of the page cache */ 44 | uint64_t pchit; 45 | /* cache misses of the page cache */ 46 | uint64_t pcmiss; 47 | /* average disk read latency */ 48 | uint32_t rlatcy; 49 | /* average disk write latency */ 50 | uint32_t wlatcy; 51 | } CDBSTAT; 52 | 53 | /* options to open a database */ 54 | enum { 55 | /* create a database if it does not exist */ 56 | CDB_CREAT = 0x1, 57 | /* clean the database if it already exists */ 58 | CDB_TRUNC = 0x2, 59 | /* fill the cache at start-up */ 60 | CDB_PAGEWARMUP = 0x4, 61 | }; 62 | 63 | /* error codes */ 64 | enum { 65 | CDB_SUCCESS = 0, 66 | CDB_NOTFOUND, 67 | CDB_EXIST, 68 | CDB_DIRNOEXIST, 69 | CDB_OPENERR, 70 | CDB_PIDEXIST, 71 | CDB_DATAERRDAT, 72 | CDB_DATAERRIDX, 73 | CDB_WRITEERR, 74 | CDB_READERR, 75 | CDB_NOFID, 76 | CDB_INTERNALERR, 77 | CDB_DATAERRMETA, 78 | CDB_MEMDBNOCACHE, 79 | }; 80 | 81 | /* record insertion
options */ 82 | enum { 83 | CDB_OVERWRITE = 0, 84 | CDB_INSERTIFEXIST = 0x1, 85 | CDB_INSERTIFNOEXIST = 0x2, 86 | CDB_INSERTCACHE = 0x8, 87 | }; 88 | 89 | /* if the database path is CDB_MEMDB, records are never written to disk, they stay in cache only */ 90 | #define CDB_MEMDB ":memory:" 91 | 92 | /* 93 | WARNING: 94 | 95 | the library has an auxiliary thread, which means calling fork() after opening a database will cause 96 | unpredictable behavior. 97 | */ 98 | 99 | /* create a cuttdb object, which should be freed by cdb_destroy() */ 100 | CDB *cdb_new(); 101 | 102 | /* cdb_option() must be called before cdb_open() 103 | 104 | the second parameter 'hsize' indicates the size of the main hash table, which can't be 105 | modified after the database has been created. To get better performance, it is suggested to 106 | set the 'hsize' to 10% - 1% of the total number of records. The default value 1 million 107 | should be proper for about 100 million records. A value that is too large or too small would 108 | lead to a drop in speed or a waste of memory 109 | 110 | the third parameter 'rcacheMB' indicates the size limit of the record cache (measured in 111 | MegaBytes), every record in cache would have about 40 bytes overhead. 112 | 113 | the fourth parameter 'pcacheMB' indicates the size limit of the index page cache (measured 114 | in MegaBytes). If a record is not in the record cache, it will be read by only 1 disk seek 115 | with enough page cache, otherwise it has to make an extra disk seek to load the page. 116 | cuttdb will use about {10 * number of records} bytes to cache all index pages, which 117 | ensures the fastest 'set' operation. 118 | 119 | the default parameter is (_db, 1000000, 128, 1024) 120 | 121 | return 0 if success, or -1 at failure. */ 122 | int cdb_option(CDB *db, int hsize, int rcacheMB, int pcacheMB); 123 | 124 | /* Enable bloomfilter, size should be the estimated number of records in database 125 | must be called before cdb_open(), 126 | The value is 100000 at minimum.
Memory cost of bloomfilter is size/8 bytes */ 127 | void cdb_option_bloomfilter(CDB *db, uint64_t size); 128 | 129 | /* this is an advanced parameter. It is the size cuttdb uses when making a read from disk. 130 | CuttDB does not know the record size even if the index is in memory, 131 | so at least one read with the default size will be performed in cdb_get(). 132 | The value is recommended to be larger than the size of most records in the database, 133 | unless the records are mostly larger than tens of KB. 134 | If the value is much larger than recommended, it will be a waste of computing. 135 | The value can only be 65536 at maximum, 1024 at minimum */ 136 | void cdb_option_areadsize(CDB *db, uint32_t size); 137 | 138 | /* open a database, 'file' should be an existing directory, or CDB_MEMDB for temporary store, 139 | 'mode' should be a combination of CDB_CREAT / CDB_TRUNC / CDB_PAGEWARMUP 140 | CDB_PAGEWARMUP means to warm up the page cache while opening 141 | If there is a file called 'force_recovery' in the data directory, even if it might be made by 'touch force_recovery', 142 | a forced recovery will happen to rebuild the index (be aware that some deleted records would reappear after this) 143 | */ 144 | int cdb_open(CDB *db, const char *file, int mode); 145 | 146 | 147 | /* simplified cdb_set2, insert a record with CDB_OVERWRITE and never expire */ 148 | int cdb_set(CDB *db, const char *key, int ksize, const char *val, int vsize); 149 | 150 | /* set a record by 'key' and 'value', 151 | opt can be a bit combination of CDB_INSERTCACHE and one of {CDB_INSERTIFEXIST, CDB_INSERTIFNOEXIST, 152 | CDB_OVERWRITE} 153 | expire is the time for the record to be valid, measured in seconds. 0 means never expire. 154 | return 0 if success, or -1 at failure. */ 155 | int cdb_set2(CDB *db, const char *key, int ksize, const char *val, int vsize, int opt, int expire); 156 | 157 | 158 | /* get an record by 'key', the value will be allocated and passed out by 'val', its size is 159 | 'vsize'.
return 0 if success, or -1 at failure. */ 160 | int cdb_get(CDB *db, const char *key, int ksize, void **val, int *vsize); 161 | 162 | 163 | /* the val got by cdb_get should be freed by this function; 164 | this is the safe way if there is more than one memory allocator */ 165 | void cdb_free_val(void **val); 166 | 167 | 168 | /* delete a record by 'key'. However, the space of the record would not be recycled. 169 | return 0 if success, or -1 at failure. */ 170 | int cdb_del(CDB *db, const char *key, int ksize); 171 | 172 | 173 | /* create a new iterator that begins at the given operation id */ 174 | void *cdb_iterate_new(CDB *db, uint64_t oid); 175 | 176 | /* iterate through the database with a callback, the function stops if the callback returns false 177 | The callback should accept key, ksize, value, vsize, expire time, oid 178 | Returns the number of records that have been visited */ 179 | uint64_t cdb_iterate(CDB *db, CDB_ITERCALLBACK itcb, void *arg, void *iter); 180 | 181 | /* destroy the iterator */ 182 | void cdb_iterate_destroy(CDB *db, void *iter); 183 | 184 | /* get the current statistics of db. 'stat' should point to an already allocated struct. 185 | if 'stat' is NULL, the statistics will be reset to zero. */ 186 | void cdb_stat(CDB *db, CDBSTAT *stat); 187 | 188 | 189 | /* close the database.
IT MUST BE CALLED BEFORE PROGRAM EXITS TO ENSURE DATA COMPLETION */ 190 | int cdb_close(CDB *db); 191 | 192 | 193 | /* close the database if it opened, and free the object */ 194 | int cdb_destroy(CDB *db); 195 | 196 | 197 | /* get last error number in current thread */ 198 | int cdb_errno(CDB *db); 199 | 200 | 201 | /* get the description of an error number */ 202 | const char *cdb_errmsg(int ecode); 203 | 204 | 205 | /* set callback when error happened, 'cdb_deferrorcb' is optional, which shows the error to stderr */ 206 | void cdb_seterrcb(CDB *db, CDB_ERRCALLBACK errcb, void *arg); 207 | 208 | /* a possible error callback */ 209 | void cdb_deferrorcb(void *arg, int errno, const char *file, int line); 210 | 211 | #if defined(__cplusplus) 212 | } 213 | #endif 214 | 215 | #endif 216 | -------------------------------------------------------------------------------- /src/cuttdb-server.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Beansdb - A high available distributed key-value storage system: 3 | * 4 | * http://beansdb.googlecode.com 5 | * 6 | * The source code of Beansdb is most based on Memcachedb and Memcached: 7 | * 8 | * http://memcachedb.org/ 9 | * http://danga.com/memcached/ 10 | * 11 | * Copyright 2009 Douban Inc. All rights reserved. 12 | * 13 | * Use and distribution licensed under the BSD license. See 14 | * the LICENSE file for full text. 15 | * 16 | * Authors: 17 | * Davies Liu 18 | * 19 | */ 20 | 21 | #ifdef HAVE_CONFIG_H 22 | #include "config.h" 23 | #endif 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | #define DATA_BUFFER_SIZE 2048 32 | #define MAX_PAYLOAD_SIZE 1400 33 | #define MAX_SENDBUF_SIZE (256 * 1024 * 1024) 34 | /* I'm told the max legnth of a 64-bit num converted to string is 20 bytes. 
35 | * Plus a few for spaces, \r\n, \0 */ 36 | #define SUFFIX_SIZE 24 37 | #define INCR_MAX_STORAGE_LEN 24 38 | 39 | /** Initial size of list of items being returned by "get". */ 40 | #define ITEM_LIST_INITIAL 200 41 | 42 | /** Initial size of the sendmsg() scatter/gather array. */ 43 | #define IOV_LIST_INITIAL 400 44 | 45 | /** Initial number of sendmsg() argument structures to allocate. */ 46 | #define MSG_LIST_INITIAL 10 47 | 48 | /** High water marks for buffer shrinking */ 49 | #define READ_BUFFER_HIGHWAT 8192 50 | #define ITEM_LIST_HIGHWAT 400 51 | #define IOV_LIST_HIGHWAT 600 52 | #define MSG_LIST_HIGHWAT 100 53 | 54 | #define MAX_REP_PRIORITY 1000000 55 | #define MAX_REP_ACK_POLICY 6 56 | #define MAX_REP_NSITES 1000 57 | 58 | 59 | #define RGET_MAX_ITEMS 100 60 | #define PACKAGE "CuttDB" 61 | #define VERSION "0.1.0" 62 | 63 | /* Get a consistent bool type */ 64 | #include 65 | 66 | #if HAVE_STDINT_H 67 | # include 68 | #else 69 | typedef unsigned char uint8_t; 70 | #endif 71 | 72 | /* unistd.h is here */ 73 | #if HAVE_UNISTD_H 74 | # include 75 | #endif 76 | 77 | /* 64-bit Portable printf */ 78 | /* printf macros for size_t, in the style of inttypes.h */ 79 | #ifdef _LP64 80 | #define __PRIS_PREFIX "z" 81 | #else 82 | #define __PRIS_PREFIX 83 | #endif 84 | 85 | #define AE_SETSIZE (1024*60) /* Max number of fd supported */ 86 | 87 | #define AE_OK 0 88 | #define AE_ERR -1 89 | 90 | #define AE_NONE 0 91 | #define AE_READABLE 1 92 | #define AE_WRITABLE 2 93 | 94 | /* Use these macros after a % in a printf format string 95 | to get correct 32/64 bit behavior, like this: 96 | size_t size = records.size(); 97 | printf("%"PRIuS"\n", size); */ 98 | 99 | #define PRIdS __PRIS_PREFIX "d" 100 | #define PRIxS __PRIS_PREFIX "x" 101 | #define PRIuS __PRIS_PREFIX "u" 102 | #define PRIXS __PRIS_PREFIX "X" 103 | #define PRIoS __PRIS_PREFIX "o" 104 | 105 | struct stats { 106 | uint32_t curr_conns; 107 | uint32_t total_conns; 108 | uint32_t conn_structs; 109 | uint64_t get_cmds; 
110 | uint64_t set_cmds; 111 | uint64_t delete_cmds; 112 | uint64_t slow_cmds; 113 | uint64_t get_hits; 114 | uint64_t get_misses; 115 | time_t started; /* when the process was started */ 116 | uint64_t bytes_read; 117 | uint64_t bytes_written; 118 | }; 119 | 120 | #define MAX_VERBOSITY_LEVEL 2 121 | 122 | struct settings { 123 | size_t item_buf_size; 124 | int maxconns; 125 | int port; 126 | char *inter; 127 | int verbose; 128 | float slow_cmd_time; 129 | int flush_period; 130 | int flush_limit; 131 | int num_threads; /* number of libevent threads to run */ 132 | }; 133 | 134 | extern struct stats stats; 135 | extern struct settings settings; 136 | 137 | typedef struct _stritem { 138 | int expire; /* expire time */ 139 | uint32_t flag; /* flag of item */ 140 | int nbytes; /* size of data */ 141 | uint8_t nsuffix; /* length of flags-and-length string */ 142 | uint8_t nkey; /* key length, w/terminating null and padding */ 143 | void * end[]; 144 | /* then null-terminated key */ 145 | /* then " flags length\r\n" (no terminating null) */ 146 | /* then data with terminating \r\n (no terminating null; it's binary!) 
*/ 147 | } item; 148 | 149 | #define ITEM_key(item) ((char*)&((item)->end[0])) 150 | 151 | /* warning: don't use these macros with a function, as it evals its arg twice */ 152 | #define ITEM_suffix(item) ((char*) &((item)->end[0]) + (item)->nkey + 1) 153 | #define ITEM_data(item) ((char*) &((item)->end[0]) + (item)->nkey + 1 + (item)->nsuffix) 154 | #define ITEM_ntotal(item) (sizeof(struct _stritem) + (item)->nkey + 1 + (item)->nsuffix + (item)->nbytes) 155 | 156 | enum conn_states { 157 | conn_listening, /** the socket which listens for connections */ 158 | conn_read, /** reading in a command line */ 159 | conn_write, /** writing out a simple response */ 160 | conn_nread, /** reading in a fixed number of bytes */ 161 | conn_swallow, /** swallowing unnecessary bytes w/o storing */ 162 | conn_closing, /** closing this connection */ 163 | conn_mwrite, /** writing out many items sequentially */ 164 | }; 165 | 166 | #define NREAD_ADD 1 167 | #define NREAD_SET 2 168 | #define NREAD_REPLACE 3 169 | #define NREAD_APPEND 4 170 | #define NREAD_PREPEND 5 171 | 172 | typedef struct conn conn; 173 | struct conn { 174 | int sfd; 175 | int state; 176 | short ev_flags; 177 | 178 | char *rbuf; /** buffer to read commands into */ 179 | char *rcurr; /** but if we parsed some already, this is where we stopped */ 180 | int rsize; /** total allocated size of rbuf */ 181 | int rbytes; /** how much data, starting from rcur, do we have unparsed */ 182 | 183 | char *wbuf; 184 | char *wcurr; 185 | int wsize; 186 | int wbytes; 187 | int write_and_go; /** which state to go into after finishing current write */ 188 | void *write_and_free; /** free this memory after finishing writing */ 189 | bool noreply; /* True if the reply should not be sent. 
*/ 190 | 191 | char *ritem; /** when we read in an item's value, it goes here */ 192 | int rlbytes; 193 | 194 | /* data for the nread state */ 195 | 196 | /** 197 | * item is used to hold an item structure created after reading the command 198 | * line of set/add/replace commands, but before we finished reading the actual 199 | * data. The data is read into ITEM_data(item) to avoid extra copying. 200 | */ 201 | 202 | void *item; /* for commands set/add/replace */ 203 | int item_comm; /* which one is it: set/add/replace */ 204 | 205 | /* data for the swallow state */ 206 | int sbytes; /* how many bytes to swallow */ 207 | 208 | /* data for the mwrite state */ 209 | struct iovec *iov; 210 | int iovsize; /* number of elements allocated in iov[] */ 211 | int iovused; /* number of elements used in iov[] */ 212 | 213 | struct msghdr *msglist; 214 | int msgsize; /* number of elements allocated in msglist[] */ 215 | int msgused; /* number of elements used in msglist[] */ 216 | int msgcurr; /* element in msglist[] being transmitted now */ 217 | int msgbytes; /* number of bytes in current msg */ 218 | 219 | item **ilist; /* list of items to write out */ 220 | int isize; 221 | item **icurr; 222 | int ileft; 223 | 224 | conn *next; /* Used for generating a list of conn structures */ 225 | }; 226 | 227 | /* 228 | * Functions 229 | */ 230 | 231 | /* item management */ 232 | /* 233 | void item_init(void); 234 | item *do_item_from_freelist(void); 235 | int do_item_add_to_freelist(item *it); 236 | item *item_alloc1(char *key, const size_t nkey, const int flags, const int nbytes); 237 | int item_free(item *it); 238 | item *item_get(char *key, size_t nkey); 239 | */ 240 | 241 | /* conn management */ 242 | conn *do_conn_from_freelist(); 243 | bool do_conn_add_to_freelist(conn *c); 244 | conn *conn_new(const int sfd, const int init_state, const int read_buffer_size); 245 | 246 | int store_item(item *item, int comm); 247 | 248 | void thread_init(int nthreads); 249 | int add_event(int 
fd, int mask, conn *c); 250 | void loop_run(int nthreads); 251 | 252 | void drive_machine(conn *c); 253 | 254 | /* Lock wrappers for cache functions that are called from main loop. */ 255 | conn *mt_conn_from_freelist(void); 256 | bool mt_conn_add_to_freelist(conn *c); 257 | item *mt_item_from_freelist(void); 258 | int mt_item_add_to_freelist(item *it); 259 | void mt_stats_lock(void); 260 | void mt_stats_unlock(void); 261 | 262 | #define conn_from_freelist() mt_conn_from_freelist() 263 | #define conn_add_to_freelist(x) mt_conn_add_to_freelist(x) 264 | #define item_from_freelist() mt_item_from_freelist() 265 | #define item_add_to_freelist(x) mt_item_add_to_freelist(x) 266 | #define STATS_LOCK() mt_stats_lock() 267 | #define STATS_UNLOCK() mt_stats_unlock() 268 | 269 | extern int daemon_quit; 270 | 271 | -------------------------------------------------------------------------------- /src/cdb_crc64.c: -------------------------------------------------------------------------------- 1 | /* 2 | * CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 8 | * Use and distribution licensed under the BSD license. 
9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | 16 | /************************************************************** 17 | * * 18 | * Fichier : crc64.c * 19 | * Fonction pour calculer le CRC64 * 20 | * * 21 | **************************************************************/ 22 | #include "cdb_crc64.h" 23 | 24 | 25 | #define CONST64(n) (n##ULL) 26 | static uint64_t CRC64_Table[256] = 27 | { 28 | CONST64(0x0000000000000000), CONST64(0x42f0e1eba9ea3693), 29 | CONST64(0x85e1c3d753d46d26), CONST64(0xc711223cfa3e5bb5), 30 | CONST64(0x493366450e42ecdf), CONST64(0x0bc387aea7a8da4c), 31 | CONST64(0xccd2a5925d9681f9), CONST64(0x8e224479f47cb76a), 32 | CONST64(0x9266cc8a1c85d9be), CONST64(0xd0962d61b56fef2d), 33 | CONST64(0x17870f5d4f51b498), CONST64(0x5577eeb6e6bb820b), 34 | CONST64(0xdb55aacf12c73561), CONST64(0x99a54b24bb2d03f2), 35 | CONST64(0x5eb4691841135847), CONST64(0x1c4488f3e8f96ed4), 36 | CONST64(0x663d78ff90e185ef), CONST64(0x24cd9914390bb37c), 37 | CONST64(0xe3dcbb28c335e8c9), CONST64(0xa12c5ac36adfde5a), 38 | CONST64(0x2f0e1eba9ea36930), CONST64(0x6dfeff5137495fa3), 39 | CONST64(0xaaefdd6dcd770416), CONST64(0xe81f3c86649d3285), 40 | CONST64(0xf45bb4758c645c51), CONST64(0xb6ab559e258e6ac2), 41 | CONST64(0x71ba77a2dfb03177), CONST64(0x334a9649765a07e4), 42 | CONST64(0xbd68d2308226b08e), CONST64(0xff9833db2bcc861d), 43 | CONST64(0x388911e7d1f2dda8), CONST64(0x7a79f00c7818eb3b), 44 | CONST64(0xcc7af1ff21c30bde), CONST64(0x8e8a101488293d4d), 45 | CONST64(0x499b3228721766f8), CONST64(0x0b6bd3c3dbfd506b), 46 | CONST64(0x854997ba2f81e701), CONST64(0xc7b97651866bd192), 47 | CONST64(0x00a8546d7c558a27), CONST64(0x4258b586d5bfbcb4), 48 | CONST64(0x5e1c3d753d46d260), CONST64(0x1cecdc9e94ace4f3), 49 | CONST64(0xdbfdfea26e92bf46), CONST64(0x990d1f49c77889d5), 50 | CONST64(0x172f5b3033043ebf), CONST64(0x55dfbadb9aee082c), 51 | CONST64(0x92ce98e760d05399), CONST64(0xd03e790cc93a650a), 52 | CONST64(0xaa478900b1228e31), 
CONST64(0xe8b768eb18c8b8a2), 53 | CONST64(0x2fa64ad7e2f6e317), CONST64(0x6d56ab3c4b1cd584), 54 | CONST64(0xe374ef45bf6062ee), CONST64(0xa1840eae168a547d), 55 | CONST64(0x66952c92ecb40fc8), CONST64(0x2465cd79455e395b), 56 | CONST64(0x3821458aada7578f), CONST64(0x7ad1a461044d611c), 57 | CONST64(0xbdc0865dfe733aa9), CONST64(0xff3067b657990c3a), 58 | CONST64(0x711223cfa3e5bb50), CONST64(0x33e2c2240a0f8dc3), 59 | CONST64(0xf4f3e018f031d676), CONST64(0xb60301f359dbe0e5), 60 | CONST64(0xda050215ea6c212f), CONST64(0x98f5e3fe438617bc), 61 | CONST64(0x5fe4c1c2b9b84c09), CONST64(0x1d14202910527a9a), 62 | CONST64(0x93366450e42ecdf0), CONST64(0xd1c685bb4dc4fb63), 63 | CONST64(0x16d7a787b7faa0d6), CONST64(0x5427466c1e109645), 64 | CONST64(0x4863ce9ff6e9f891), CONST64(0x0a932f745f03ce02), 65 | CONST64(0xcd820d48a53d95b7), CONST64(0x8f72eca30cd7a324), 66 | CONST64(0x0150a8daf8ab144e), CONST64(0x43a04931514122dd), 67 | CONST64(0x84b16b0dab7f7968), CONST64(0xc6418ae602954ffb), 68 | CONST64(0xbc387aea7a8da4c0), CONST64(0xfec89b01d3679253), 69 | CONST64(0x39d9b93d2959c9e6), CONST64(0x7b2958d680b3ff75), 70 | CONST64(0xf50b1caf74cf481f), CONST64(0xb7fbfd44dd257e8c), 71 | CONST64(0x70eadf78271b2539), CONST64(0x321a3e938ef113aa), 72 | CONST64(0x2e5eb66066087d7e), CONST64(0x6cae578bcfe24bed), 73 | CONST64(0xabbf75b735dc1058), CONST64(0xe94f945c9c3626cb), 74 | CONST64(0x676dd025684a91a1), CONST64(0x259d31cec1a0a732), 75 | CONST64(0xe28c13f23b9efc87), CONST64(0xa07cf2199274ca14), 76 | CONST64(0x167ff3eacbaf2af1), CONST64(0x548f120162451c62), 77 | CONST64(0x939e303d987b47d7), CONST64(0xd16ed1d631917144), 78 | CONST64(0x5f4c95afc5edc62e), CONST64(0x1dbc74446c07f0bd), 79 | CONST64(0xdaad56789639ab08), CONST64(0x985db7933fd39d9b), 80 | CONST64(0x84193f60d72af34f), CONST64(0xc6e9de8b7ec0c5dc), 81 | CONST64(0x01f8fcb784fe9e69), CONST64(0x43081d5c2d14a8fa), 82 | CONST64(0xcd2a5925d9681f90), CONST64(0x8fdab8ce70822903), 83 | CONST64(0x48cb9af28abc72b6), CONST64(0x0a3b7b1923564425), 84 | 
CONST64(0x70428b155b4eaf1e), CONST64(0x32b26afef2a4998d), 85 | CONST64(0xf5a348c2089ac238), CONST64(0xb753a929a170f4ab), 86 | CONST64(0x3971ed50550c43c1), CONST64(0x7b810cbbfce67552), 87 | CONST64(0xbc902e8706d82ee7), CONST64(0xfe60cf6caf321874), 88 | CONST64(0xe224479f47cb76a0), CONST64(0xa0d4a674ee214033), 89 | CONST64(0x67c58448141f1b86), CONST64(0x253565a3bdf52d15), 90 | CONST64(0xab1721da49899a7f), CONST64(0xe9e7c031e063acec), 91 | CONST64(0x2ef6e20d1a5df759), CONST64(0x6c0603e6b3b7c1ca), 92 | CONST64(0xf6fae5c07d3274cd), CONST64(0xb40a042bd4d8425e), 93 | CONST64(0x731b26172ee619eb), CONST64(0x31ebc7fc870c2f78), 94 | CONST64(0xbfc9838573709812), CONST64(0xfd39626eda9aae81), 95 | CONST64(0x3a28405220a4f534), CONST64(0x78d8a1b9894ec3a7), 96 | CONST64(0x649c294a61b7ad73), CONST64(0x266cc8a1c85d9be0), 97 | CONST64(0xe17dea9d3263c055), CONST64(0xa38d0b769b89f6c6), 98 | CONST64(0x2daf4f0f6ff541ac), CONST64(0x6f5faee4c61f773f), 99 | CONST64(0xa84e8cd83c212c8a), CONST64(0xeabe6d3395cb1a19), 100 | CONST64(0x90c79d3fedd3f122), CONST64(0xd2377cd44439c7b1), 101 | CONST64(0x15265ee8be079c04), CONST64(0x57d6bf0317edaa97), 102 | CONST64(0xd9f4fb7ae3911dfd), CONST64(0x9b041a914a7b2b6e), 103 | CONST64(0x5c1538adb04570db), CONST64(0x1ee5d94619af4648), 104 | CONST64(0x02a151b5f156289c), CONST64(0x4051b05e58bc1e0f), 105 | CONST64(0x87409262a28245ba), CONST64(0xc5b073890b687329), 106 | CONST64(0x4b9237f0ff14c443), CONST64(0x0962d61b56fef2d0), 107 | CONST64(0xce73f427acc0a965), CONST64(0x8c8315cc052a9ff6), 108 | CONST64(0x3a80143f5cf17f13), CONST64(0x7870f5d4f51b4980), 109 | CONST64(0xbf61d7e80f251235), CONST64(0xfd913603a6cf24a6), 110 | CONST64(0x73b3727a52b393cc), CONST64(0x31439391fb59a55f), 111 | CONST64(0xf652b1ad0167feea), CONST64(0xb4a25046a88dc879), 112 | CONST64(0xa8e6d8b54074a6ad), CONST64(0xea16395ee99e903e), 113 | CONST64(0x2d071b6213a0cb8b), CONST64(0x6ff7fa89ba4afd18), 114 | CONST64(0xe1d5bef04e364a72), CONST64(0xa3255f1be7dc7ce1), 115 | CONST64(0x64347d271de22754), 
CONST64(0x26c49cccb40811c7), 116 | CONST64(0x5cbd6cc0cc10fafc), CONST64(0x1e4d8d2b65facc6f), 117 | CONST64(0xd95caf179fc497da), CONST64(0x9bac4efc362ea149), 118 | CONST64(0x158e0a85c2521623), CONST64(0x577eeb6e6bb820b0), 119 | CONST64(0x906fc95291867b05), CONST64(0xd29f28b9386c4d96), 120 | CONST64(0xcedba04ad0952342), CONST64(0x8c2b41a1797f15d1), 121 | CONST64(0x4b3a639d83414e64), CONST64(0x09ca82762aab78f7), 122 | CONST64(0x87e8c60fded7cf9d), CONST64(0xc51827e4773df90e), 123 | CONST64(0x020905d88d03a2bb), CONST64(0x40f9e43324e99428), 124 | CONST64(0x2cffe7d5975e55e2), CONST64(0x6e0f063e3eb46371), 125 | CONST64(0xa91e2402c48a38c4), CONST64(0xebeec5e96d600e57), 126 | CONST64(0x65cc8190991cb93d), CONST64(0x273c607b30f68fae), 127 | CONST64(0xe02d4247cac8d41b), CONST64(0xa2dda3ac6322e288), 128 | CONST64(0xbe992b5f8bdb8c5c), CONST64(0xfc69cab42231bacf), 129 | CONST64(0x3b78e888d80fe17a), CONST64(0x7988096371e5d7e9), 130 | CONST64(0xf7aa4d1a85996083), CONST64(0xb55aacf12c735610), 131 | CONST64(0x724b8ecdd64d0da5), CONST64(0x30bb6f267fa73b36), 132 | CONST64(0x4ac29f2a07bfd00d), CONST64(0x08327ec1ae55e69e), 133 | CONST64(0xcf235cfd546bbd2b), CONST64(0x8dd3bd16fd818bb8), 134 | CONST64(0x03f1f96f09fd3cd2), CONST64(0x41011884a0170a41), 135 | CONST64(0x86103ab85a2951f4), CONST64(0xc4e0db53f3c36767), 136 | CONST64(0xd8a453a01b3a09b3), CONST64(0x9a54b24bb2d03f20), 137 | CONST64(0x5d45907748ee6495), CONST64(0x1fb5719ce1045206), 138 | CONST64(0x919735e51578e56c), CONST64(0xd367d40ebc92d3ff), 139 | CONST64(0x1476f63246ac884a), CONST64(0x568617d9ef46bed9), 140 | CONST64(0xe085162ab69d5e3c), CONST64(0xa275f7c11f7768af), 141 | CONST64(0x6564d5fde549331a), CONST64(0x279434164ca30589), 142 | CONST64(0xa9b6706fb8dfb2e3), CONST64(0xeb46918411358470), 143 | CONST64(0x2c57b3b8eb0bdfc5), CONST64(0x6ea7525342e1e956), 144 | CONST64(0x72e3daa0aa188782), CONST64(0x30133b4b03f2b111), 145 | CONST64(0xf7021977f9cceaa4), CONST64(0xb5f2f89c5026dc37), 146 | CONST64(0x3bd0bce5a45a6b5d), 
CONST64(0x79205d0e0db05dce), 147 | CONST64(0xbe317f32f78e067b), CONST64(0xfcc19ed95e6430e8), 148 | CONST64(0x86b86ed5267cdbd3), CONST64(0xc4488f3e8f96ed40), 149 | CONST64(0x0359ad0275a8b6f5), CONST64(0x41a94ce9dc428066), 150 | CONST64(0xcf8b0890283e370c), CONST64(0x8d7be97b81d4019f), 151 | CONST64(0x4a6acb477bea5a2a), CONST64(0x089a2aacd2006cb9), 152 | CONST64(0x14dea25f3af9026d), CONST64(0x562e43b4931334fe), 153 | CONST64(0x913f6188692d6f4b), CONST64(0xd3cf8063c0c759d8), 154 | CONST64(0x5dedc41a34bbeeb2), CONST64(0x1f1d25f19d51d821), 155 | CONST64(0xd80c07cd676f8394), CONST64(0x9afce626ce85b507) 156 | }; 157 | 158 | 159 | uint64_t cdb_crc64(const void *buf, uint32_t len) 160 | { 161 | uint32_t i; 162 | uint64_t crc = 0xFFFFFFFFFFFFFFFF; 163 | uint8_t *cbuf = (uint8_t *)buf; 164 | 165 | for (i = 0; i < len; i++) { 166 | crc = CRC64_Table[(uint8_t)(crc >> 56) ^ *cbuf++] ^ (crc << 8); 167 | } 168 | return crc; 169 | } 170 | 171 | -------------------------------------------------------------------------------- /src/cdb_hashtable.c: -------------------------------------------------------------------------------- 1 | /* 2 | * CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 8 | * Use and distribution licensed under the BSD license. 
9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | 16 | #include "cdb_hashtable.h" 17 | #include 18 | #include 19 | 20 | /* 21 | #define LRUPREV(i) (*(CDBHTITEM**)&((i)->buf[0])) 22 | #define LRUNEXT(i) (*(CDBHTITEM**)&((i)->buf[sizeof(void*)])) 23 | */ 24 | 25 | #define LRUPREV(i) ((i)->lruptr[0]) 26 | #define LRUNEXT(i) ((i)->lruptr[1]) 27 | 28 | static uint32_t MurmurHash1( const void * key, int len) 29 | { 30 | const unsigned int m = 0xc6a4a793; 31 | const int r = 16; 32 | unsigned int h = 0x19900917 ^ (len * m); 33 | const unsigned char * data = (const unsigned char *)key; 34 | 35 | while(len >= 4) 36 | { 37 | unsigned int k = *(unsigned int *)data; 38 | h += k; h *= m; h ^= h >> 16; 39 | data += 4; len -= 4; 40 | } 41 | 42 | switch(len) 43 | { 44 | case 3: 45 | h += data[2] << 16; 46 | case 2: 47 | h += data[1] << 8; 48 | case 1: 49 | h += data[0]; 50 | h *= m; 51 | h ^= h >> r; 52 | }; 53 | 54 | h *= m; h ^= h >> 10; 55 | h *= m; h ^= h >> 17; 56 | return h; 57 | } 58 | 59 | void *cdb_ht_itemkey(CDBHASHTABLE *ht, CDBHTITEM *item) 60 | { 61 | return (void *)(item->buf + ht->lru * 2 * sizeof(void*)); 62 | } 63 | 64 | void *cdb_ht_itemval(CDBHASHTABLE *ht, CDBHTITEM *item) 65 | { 66 | return (void *)(item->buf + ht->lru * 2 * sizeof(void*) + item->ksize); 67 | } 68 | 69 | CDBHASHTABLE *cdb_ht_new(bool lru, CDBHASHFUNC hashfunc) 70 | { 71 | CDBHASHTABLE *ht; 72 | 73 | ht = (CDBHASHTABLE*)malloc(sizeof(CDBHASHTABLE)); 74 | ht->hash = NULL; 75 | ht->lru = lru; 76 | ht->num = ht->size = 0; 77 | ht->tail = ht->head = NULL; 78 | for(uint32_t i = 0; i < (1<buckets[i]); 80 | bucket->bnum = 2; 81 | uint32_t lsize = sizeof(CDBHTITEM *) * bucket->bnum; 82 | bucket->rnum = 0; 83 | bucket->items = (CDBHTITEM **)malloc(lsize); 84 | ht->size += lsize; 85 | memset(bucket->items, 0, lsize); 86 | } 87 | ht->hash = hashfunc; 88 | if (ht->hash == NULL) 89 | ht->hash = MurmurHash1; 90 | 91 | ht->size += sizeof(CDBHASHTABLE); 92 | 93 | 
return ht; 94 | } 95 | 96 | CDBHTITEM *cdb_ht_newitem(CDBHASHTABLE *ht, int ksize, int vsize) 97 | { 98 | CDBHTITEM *item; 99 | int hsize; 100 | 101 | if (ht->lru) 102 | hsize = sizeof(CDBHTITEM) + 2 * sizeof(void*); 103 | else 104 | hsize = sizeof(CDBHTITEM); 105 | 106 | item = (CDBHTITEM*)malloc(hsize + ksize + vsize); 107 | item->ksize = ksize; 108 | item->vsize = vsize; 109 | if (ht->lru) { 110 | LRUPREV(item) = NULL; 111 | LRUNEXT(item) = NULL; 112 | } 113 | return item; 114 | } 115 | 116 | 117 | 118 | 119 | void cdb_ht_insert(CDBHASHTABLE *ht, CDBHTITEM *item) 120 | { 121 | uint32_t bid, hid; 122 | CDBHTBUCKET *bucket; 123 | 124 | item->hash = ht->hash(cdb_ht_itemkey(ht, item), item->ksize); 125 | bid = item->hash & ((1<buckets[bid]); 127 | hid = (item->hash >> CDBHTBNUMPOW) & (bucket->bnum-1); 128 | 129 | if (bucket->rnum > bucket->bnum * 2) { 130 | CDBHTITEM **ilist; 131 | uint32_t exp = 2; 132 | if (bucket->bnum < 512) 133 | exp = 4; 134 | int listsize = (bucket->bnum * exp) * sizeof(CDBHTITEM*); 135 | ilist = (CDBHTITEM**)malloc(listsize); 136 | memset(ilist, 0, listsize); 137 | for(uint32_t i = 0; i < bucket->bnum; i++) { 138 | CDBHTITEM *curitem = bucket->items[i]; 139 | while(curitem != NULL) { 140 | CDBHTITEM *nextitem = curitem->hnext; 141 | uint32_t hid = (curitem->hash>>CDBHTBNUMPOW) 142 | & (bucket->bnum * exp - 1); 143 | curitem->hnext = ilist[hid]; 144 | ilist[hid] = curitem; 145 | curitem = nextitem; 146 | } 147 | } 148 | free(bucket->items); 149 | bucket->items = ilist; 150 | ht->size += listsize - bucket->bnum * sizeof(CDBHTITEM *); 151 | bucket->bnum *= exp; 152 | hid = (item->hash >> CDBHTBNUMPOW) & (bucket->bnum - 1); 153 | } 154 | 155 | { 156 | CDBHTITEM *curitem = bucket->items[hid]; 157 | CDBHTITEM *preitem = NULL; 158 | while(curitem != NULL) { 159 | if (curitem->hash == item->hash 160 | && curitem->ksize == item->ksize 161 | && memcmp(cdb_ht_itemkey(ht, curitem), 162 | cdb_ht_itemkey(ht, item) ,curitem->ksize) == 0) { 163 | CDBHTITEM 
*tmp; 164 | if (ht->lru) { 165 | if (LRUPREV(curitem)) 166 | LRUNEXT(LRUPREV(curitem)) = LRUNEXT(curitem); 167 | if (LRUNEXT(curitem)) 168 | LRUPREV(LRUNEXT(curitem)) = LRUPREV(curitem); 169 | if (ht->head == curitem) 170 | ht->head = LRUNEXT(curitem); 171 | if (ht->tail == curitem) 172 | ht->tail = LRUPREV(curitem); 173 | } 174 | if (preitem) 175 | preitem->hnext = curitem->hnext; 176 | else 177 | bucket->items[hid] = curitem->hnext; 178 | tmp = curitem->hnext; 179 | ht->size -= sizeof(CDBHTITEM) + curitem->ksize + curitem->vsize 180 | + (ht->lru > 0) * sizeof(CDBHTITEM*) * 2; 181 | ht->num--; 182 | bucket->rnum--; 183 | free(curitem); 184 | curitem = tmp; 185 | break; 186 | } 187 | preitem = curitem; 188 | curitem = curitem->hnext; 189 | } 190 | } 191 | 192 | item->hnext = bucket->items[hid]; 193 | bucket->items[hid] = item; 194 | 195 | if (ht->lru) { 196 | if (ht->head) LRUPREV(ht->head) = item; 197 | LRUPREV(item) = NULL; 198 | LRUNEXT(item) = ht->head; 199 | ht->head = item; 200 | if (ht->tail == NULL) 201 | ht->tail = item; 202 | } 203 | 204 | bucket->rnum++; 205 | ht->num++; 206 | ht->size += sizeof(CDBHTITEM) + item->ksize + item->vsize 207 | + ht->lru * sizeof(CDBHTITEM*) * 2; 208 | } 209 | 210 | 211 | void *cdb_ht_insert2(CDBHASHTABLE *ht, const void *key, int ksize, const void *val, int vsize) 212 | { 213 | CDBHTITEM *item; 214 | 215 | item = cdb_ht_newitem(ht, ksize, vsize); 216 | memcpy(cdb_ht_itemkey(ht, item), key, ksize); 217 | memcpy(cdb_ht_itemval(ht, item), val, vsize); 218 | cdb_ht_insert(ht, item); 219 | return cdb_ht_itemval(ht, item); 220 | } 221 | 222 | void *cdb_ht_get(CDBHASHTABLE *ht, const void *key, int ksize, int *vsize, bool mtf) 223 | { 224 | CDBHTITEM *res; 225 | 226 | res = cdb_ht_get3(ht, key, ksize, mtf); 227 | if (res) { 228 | *vsize = res->vsize; 229 | return cdb_ht_itemval(ht, res); 230 | } else { 231 | *vsize = 0; 232 | return NULL; 233 | } 234 | } 235 | 236 | 237 | void *cdb_ht_get2(CDBHASHTABLE *ht, const void *key, int 
ksize, bool mtf) 238 | { 239 | CDBHTITEM *res; 240 | 241 | res = cdb_ht_get3(ht, key, ksize, mtf); 242 | if (res) 243 | return cdb_ht_itemval(ht, res); 244 | else 245 | return NULL; 246 | } 247 | 248 | 249 | CDBHTITEM *cdb_ht_get3(CDBHASHTABLE *ht, const void *key, int ksize, bool mtf) 250 | { 251 | uint32_t hash, bid, hid; 252 | CDBHTBUCKET *bucket; 253 | CDBHTITEM *curitem; 254 | 255 | hash = ht->hash(key, ksize); 256 | bid = hash & ((1<buckets[bid]); 258 | hid = (hash >> CDBHTBNUMPOW) & (bucket->bnum - 1); 259 | 260 | curitem = bucket->items[hid]; 261 | while (curitem != NULL) { 262 | if (curitem->hash == hash 263 | && curitem->ksize == ksize 264 | && memcmp(cdb_ht_itemkey(ht, curitem), key , ksize) == 0) { 265 | if (ht->lru && mtf && ht->head != curitem) { 266 | if (LRUPREV(curitem)) 267 | LRUNEXT(LRUPREV(curitem)) = LRUNEXT(curitem); 268 | if (LRUNEXT(curitem)) 269 | LRUPREV(LRUNEXT(curitem)) = LRUPREV(curitem); 270 | if (ht->tail == curitem) 271 | ht->tail = LRUPREV(curitem); 272 | 273 | LRUNEXT(curitem) = ht->head; 274 | LRUPREV(ht->head) = curitem; 275 | ht->head = curitem; 276 | LRUPREV(curitem) = NULL; 277 | } 278 | return curitem; 279 | } 280 | curitem = curitem->hnext; 281 | } 282 | return NULL; 283 | } 284 | 285 | 286 | bool cdb_ht_exist(CDBHASHTABLE *ht, const void *key, int ksize) 287 | { 288 | int vsize; 289 | return (cdb_ht_get(ht, key, ksize, &vsize, false) != NULL); 290 | } 291 | 292 | 293 | int cdb_ht_del2(CDBHASHTABLE *ht, const void *key, int ksize) 294 | { 295 | CDBHTITEM *res = NULL; 296 | res = cdb_ht_del(ht, key, ksize); 297 | if (res) { 298 | free(res); 299 | return 0; 300 | } 301 | return -1; 302 | } 303 | 304 | 305 | CDBHTITEM *cdb_ht_del(CDBHASHTABLE *ht, const void *key, int ksize) 306 | { 307 | uint32_t hash, bid, hid; 308 | CDBHTBUCKET *bucket; 309 | CDBHTITEM *curitem, *preitem; 310 | CDBHTITEM *res = NULL; 311 | 312 | hash = ht->hash(key, ksize); 313 | bid = hash & ((1<buckets[bid]); 315 | hid = (hash >> CDBHTBNUMPOW) & 
(bucket->bnum - 1); 316 | 317 | curitem = bucket->items[hid]; 318 | preitem = NULL; 319 | while(curitem != NULL) { 320 | if (curitem->hash == hash 321 | && curitem->ksize == ksize 322 | && memcmp(cdb_ht_itemkey(ht, curitem), 323 | key, ksize) == 0) { 324 | if (ht->lru) { 325 | if (LRUPREV(curitem)) 326 | LRUNEXT(LRUPREV(curitem)) = LRUNEXT(curitem); 327 | if (LRUNEXT(curitem)) 328 | LRUPREV(LRUNEXT(curitem)) = LRUPREV(curitem); 329 | if (ht->head == curitem) 330 | ht->head = LRUNEXT(curitem); 331 | if (ht->tail == curitem) 332 | ht->tail = LRUPREV(curitem); 333 | } 334 | if (preitem) 335 | preitem->hnext = curitem->hnext; 336 | else 337 | bucket->items[hid] = curitem->hnext; 338 | ht->size -= sizeof(CDBHTITEM) + curitem->ksize + curitem->vsize 339 | + (ht->lru > 0) * sizeof(CDBHTITEM*) * 2; 340 | ht->num--; 341 | bucket->rnum--; 342 | res = curitem; 343 | curitem = curitem->hnext; 344 | break; 345 | } 346 | preitem = curitem; 347 | curitem = curitem->hnext; 348 | } 349 | 350 | return res; 351 | } 352 | 353 | 354 | void cdb_ht_removetail(CDBHASHTABLE *ht) 355 | { 356 | CDBHTITEM *item; 357 | 358 | item = cdb_ht_poptail(ht); 359 | if (item) 360 | free(item); 361 | return; 362 | } 363 | 364 | 365 | CDBHTITEM *cdb_ht_gettail(CDBHASHTABLE *ht) 366 | { 367 | return ht->tail; 368 | } 369 | 370 | 371 | CDBHTITEM *cdb_ht_poptail(CDBHASHTABLE *ht) 372 | { 373 | CDBHTITEM *item = ht->tail, *curitem, *preitem;; 374 | CDBHTBUCKET *bucket; 375 | uint32_t bid, hid; 376 | 377 | if (!(ht->lru) || item == NULL) 378 | return NULL; 379 | 380 | bid = item->hash & ((1<buckets[bid]); 382 | hid = (item->hash >> CDBHTBNUMPOW) & (bucket->bnum - 1); 383 | 384 | curitem = bucket->items[hid]; 385 | preitem = NULL; 386 | while (curitem != NULL) { 387 | if (curitem->hash == item->hash 388 | && curitem->ksize == item->ksize 389 | && memcmp(cdb_ht_itemkey(ht, curitem), 390 | cdb_ht_itemkey(ht, item), item->ksize) == 0) { 391 | if (preitem) { 392 | preitem->hnext = curitem->hnext; 393 | } else { 
394 | bucket->items[hid] = curitem->hnext; 395 | } 396 | break; 397 | } 398 | preitem = curitem; 399 | curitem = curitem->hnext; 400 | } 401 | 402 | if (LRUPREV(item)) 403 | LRUNEXT(LRUPREV(item)) = NULL; 404 | if (ht->head == item) 405 | ht->head = NULL; 406 | ht->tail = LRUPREV(item); 407 | bucket->rnum--; 408 | ht->num--; 409 | ht->size -= sizeof(CDBHTITEM) + item->ksize + item->vsize 410 | + sizeof(CDBHTITEM*) * 2; 411 | return item; 412 | } 413 | 414 | void cdb_ht_clean(CDBHASHTABLE *ht) 415 | { 416 | for(uint32_t i = 0; i < (1<buckets[i]); 418 | for(uint32_t j = 0; j < bucket->bnum; j++) { 419 | CDBHTITEM *curitem = bucket->items[j]; 420 | while(curitem != NULL) { 421 | CDBHTITEM *tmp = curitem->hnext; 422 | free(curitem); 423 | curitem = tmp; 424 | } 425 | bucket->items[j] = NULL; 426 | } 427 | bucket->rnum = 0; 428 | } 429 | ht->num = 0; 430 | } 431 | 432 | 433 | void cdb_ht_destroy(CDBHASHTABLE *ht) 434 | { 435 | if (ht->lru) { 436 | CDBHTITEM *curitem = ht->head; 437 | while(curitem) { 438 | CDBHTITEM *nextitem = LRUNEXT(curitem); 439 | free(curitem); 440 | curitem = nextitem; 441 | } 442 | } 443 | 444 | for(uint32_t i = 0; i < (1<buckets[i]); 446 | 447 | for(uint32_t j = 0; j < bucket->bnum && (!ht->lru); j++) { 448 | CDBHTITEM *curitem = bucket->items[j]; 449 | while(curitem != NULL) { 450 | CDBHTITEM *tmp = curitem->hnext; 451 | free(curitem); 452 | curitem = tmp; 453 | } 454 | } 455 | free(bucket->items); 456 | } 457 | free(ht); 458 | } 459 | 460 | 461 | CDBHTITEM *cdb_ht_iterbegin(CDBHASHTABLE *ht) 462 | { 463 | for(uint32_t i = 0; i < (1<buckets[i]); 465 | if (!bucket->rnum) 466 | continue; 467 | for(uint32_t j = 0; j < bucket->bnum; j++) 468 | if (bucket->items[j]) 469 | return bucket->items[j]; 470 | } 471 | 472 | return NULL; 473 | } 474 | 475 | 476 | CDBHTITEM *cdb_ht_iternext(CDBHASHTABLE *ht, CDBHTITEM *cur) 477 | { 478 | if (cur == NULL) 479 | return NULL; 480 | 481 | if (cur->hnext) 482 | return cur->hnext; 483 | 484 | uint32_t bid = 
cur->hash & ((1<buckets[bid]); 486 | uint32_t hid = (cur->hash >> CDBHTBNUMPOW) & (bucket->bnum - 1); 487 | 488 | for(uint32_t i = hid + 1; i < bucket->bnum; i++) { 489 | if (bucket->items[i]) 490 | return bucket->items[i]; 491 | } 492 | 493 | for(uint32_t i = bid + 1; i < (1<buckets[i]); 495 | if (!bucket->rnum) 496 | continue; 497 | for(int j = 0; j < bucket->bnum; j++) 498 | if (bucket->items[j]) 499 | return bucket->items[j]; 500 | } 501 | 502 | return NULL; 503 | } 504 | 505 | 506 | #ifdef _UT_ 507 | #include 508 | #include 509 | int main(int argc, char *argv[]) 510 | { 511 | CDBHASHTABLE *ht; 512 | long k, v; 513 | ht = cdb_ht_new(true, NULL); 514 | for(int i = 0; i < 1000; i++) { 515 | k = i; 516 | v = i * 1000; 517 | cdb_ht_insert2(ht, &k, sizeof(long), &v, sizeof(long)); 518 | } 519 | 520 | srand(time(NULL)); 521 | 522 | for(int i = 0; i < 1000; i++) { 523 | long *v, k = rand() % 1000; 524 | int vsize; 525 | v = (long*)cdb_ht_get(ht, &k, sizeof(long), &vsize, true); 526 | printf("get: %ld -> %ld (%d)\n", k, *v, vsize); 527 | } 528 | 529 | printf("total size: %d num: %d\n", ht->size, ht->num); 530 | 531 | CDBHTITEM *item; 532 | item = cdb_ht_poptail(ht); 533 | printf("tail: %ld - %ld\n", *(long*)cdb_ht_itemkey(ht, item), *(long*)cdb_ht_itemval(ht, item)); 534 | free(item); 535 | item = cdb_ht_poptail(ht); 536 | printf("tail: %ld - %ld\n", *(long*)cdb_ht_itemkey(ht, item), *(long*)cdb_ht_itemval(ht, item)); 537 | free(item); 538 | } 539 | #endif 540 | -------------------------------------------------------------------------------- /src/cdb_core.c: -------------------------------------------------------------------------------- 1 | /* 2 | * CuttDB - a fast key-value storage engine 3 | * 4 | * 5 | * http://code.google.com/p/cuttdb/ 6 | * 7 | * Copyright (c) 2012, Siyuan Fu. All rights reserved. 8 | * Use and distribution licensed under the BSD license. 
9 | * See the LICENSE file for full text 10 | * 11 | * Author: Siyuan Fu 12 | * 13 | */ 14 | 15 | 16 | #include "cuttdb.h" 17 | #include "cdb_crc64.h" 18 | #include "cdb_types.h" 19 | #include "cdb_hashtable.h" 20 | #include "cdb_bloomfilter.h" 21 | #include "cdb_lock.h" 22 | #include "cdb_bgtask.h" 23 | #include "cdb_errno.h" 24 | #include "cdb_vio.h" 25 | #include "cdb_core.h" 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | static void _cdb_pageout(CDB *db); 32 | static void _cdb_defparam(CDB *db); 33 | static void _cdb_recout(CDB *db); 34 | static uint32_t _pagehash(const void *key, int len); 35 | static void _cdb_flushdpagetask(void *arg); 36 | static void _cdb_timerreset(struct timespec *ts); 37 | static uint32_t _cdb_timermicrosec(struct timespec *ts); 38 | static void _cdb_pagewarmup(CDB *db, bool loadbf); 39 | 40 | 41 | /* it isn't necessary to rehash bid in hash table cache */ 42 | static uint32_t _pagehash(const void *key, int len) 43 | { 44 | return *(uint32_t*)key; 45 | } 46 | 47 | 48 | /* used to get the duration of a procedure */ 49 | static void _cdb_timerreset(struct timespec *ts) 50 | { 51 | clock_gettime(CLOCK_MONOTONIC, ts); 52 | } 53 | 54 | 55 | static uint32_t _cdb_timermicrosec(struct timespec *ts) 56 | { 57 | struct timespec ts2; 58 | uint32_t diff; 59 | clock_gettime(CLOCK_MONOTONIC, &ts2); 60 | diff = (ts2.tv_sec - ts->tv_sec) * 1000000; 61 | diff += ts2.tv_nsec / 1000; 62 | diff -= ts->tv_nsec / 1000; 63 | return diff; 64 | } 65 | 66 | 67 | /* reset the parameters */ 68 | static void _cdb_defparam(CDB *db) 69 | { 70 | db->rnum = 0; 71 | db->bfsize = 0; 72 | db->rclimit = 128 * MB; 73 | db->pclimit = 1024 * MB; 74 | db->hsize = 1000000; 75 | db->rcache = db->pcache = db->dpcache = NULL; 76 | db->bf = NULL; 77 | db->opened = false; 78 | db->vio = NULL; 79 | db->mtable = NULL; 80 | db->oid = 0; 81 | db->roid = 0; 82 | db->errcbarg = NULL; 83 | db->errcb = NULL; 84 | db->areadsize = 4 * KB; 85 | return; 86 | } 87 | 88 | 89 | 
/* flush all dirty pages */ 90 | void cdb_flushalldpage(CDB *db) 91 | { 92 | if (db->dpcache) { 93 | while (db->dpcache->num) { 94 | CDBHTITEM *item = cdb_ht_poptail(db->dpcache); 95 | uint32_t bid = *(uint32_t*)cdb_ht_itemkey(db->dpcache, item); 96 | FOFF off; 97 | db->vio->wpage(db->vio, (CDBPAGE*)cdb_ht_itemval(db->dpcache, item), &off); 98 | db->mtable[bid] = off; 99 | free(item); 100 | } 101 | 102 | db->roid = db->oid; 103 | db->vio->cleanpoint(db->vio); 104 | } 105 | } 106 | 107 | 108 | /* flush oldest dirty index page to disk, it runs in another thread and triggered by timer */ 109 | static void _cdb_flushdpagetask(void *arg) 110 | { 111 | CDB *db = (CDB *)arg; 112 | CDBHTITEM *item; 113 | CDBPAGE *page; 114 | time_t now = time(NULL); 115 | bool cleandcache = false; 116 | uint32_t bid; 117 | 118 | if (!db->dpcache) 119 | /* no dirty page cache */ 120 | return; 121 | 122 | /* if there isn't too much dirty page and some time passed since last clean, 123 | write out all dirty pages to make a recovery point(oid) */ 124 | if (db->dpcache->num < 1024 && now > db->ndpltime + 120) 125 | cleandcache = true; 126 | 127 | while(db->dpcache->num) { 128 | FOFF off; 129 | cdb_lock_lock(db->dpclock); 130 | item = cdb_ht_gettail(db->dpcache); 131 | /* no item in dpcache after lock */ 132 | if (item == NULL) { 133 | cdb_lock_unlock(db->dpclock); 134 | return; 135 | } 136 | page = (CDBPAGE *)cdb_ht_itemval(db->dpcache, item); 137 | /* bid = page->bid; also OK */ 138 | bid = *(uint32_t*)cdb_ht_itemkey(db->dpcache, item); 139 | /* been dirty for too long? 
*/ 140 | if (now > page->mtime + DPAGETIMEOUT || cleandcache) { 141 | if (cdb_lock_trylock(db->mlock[page->bid % MLOCKNUM])) { 142 | /* avoid dead lock, since dpclock is holding */ 143 | cdb_lock_unlock(db->dpclock); 144 | return; 145 | } 146 | /* remove it from dpcache */ 147 | cdb_ht_poptail(db->dpcache); 148 | cdb_lock_unlock(db->dpclock); 149 | 150 | /* write to disk */ 151 | struct timespec ts; 152 | _cdb_timerreset(&ts); 153 | db->vio->wpage(db->vio, page, &off); 154 | db->wcount++; 155 | db->wtime += _cdb_timermicrosec(&ts); 156 | db->mtable[bid] = off; 157 | 158 | /* move the clean page into pcache */ 159 | cdb_lock_lock(db->pclock); 160 | cdb_ht_insert(db->pcache, item); 161 | cdb_lock_unlock(db->pclock); 162 | cdb_lock_unlock(db->mlock[bid % MLOCKNUM]); 163 | } else { 164 | /* tail in dpcache isn't expired */ 165 | cdb_lock_unlock(db->dpclock); 166 | return; 167 | } 168 | } 169 | 170 | if (db->dpcache->num == 0 && cleandcache) 171 | db->ndpltime = now; 172 | 173 | if (cleandcache) { 174 | /* clean succeed if goes here, remember the recovery point */ 175 | /* it's not necessary to lock */ 176 | db->roid = db->oid; 177 | db->vio->cleanpoint(db->vio); 178 | } 179 | } 180 | 181 | 182 | /* fill the index page cache, and set the bloomfilter if necessary */ 183 | static void _cdb_pagewarmup(CDB *db, bool loadbf) 184 | { 185 | char sbuf[SBUFSIZE]; 186 | void *it = db->vio->pageitfirst(db->vio, 0); 187 | 188 | if (it == NULL) 189 | return; 190 | 191 | for(;;) { 192 | CDBPAGE *page = (CDBPAGE *)sbuf; 193 | if (db->vio->pageitnext(db->vio, &page, it) < 0) 194 | break; 195 | 196 | /* the page is the newest one because its offset matches the one in main table */ 197 | if (OFFEQ(page->ooff, db->mtable[page->bid])) { 198 | if (loadbf) { 199 | /* iterate key hashes in page, set to the filter */ 200 | cdb_lock_lock(db->bflock); 201 | for(uint32_t i = 0; i < page->num; i++) { 202 | uint64_t hash = (page->bid << 24) | (page->items[i].hash.i2 << 8) 203 | | 
(page->items[i].hash.i1); 204 | /* bloom filter use the combined record hash as key */ 205 | cdb_bf_set(db->bf, &hash, SI8); 206 | } 207 | cdb_lock_unlock(db->bflock); 208 | } 209 | 210 | /* set the page to pcache if it doesn't exceed the limit size */ 211 | if (db->pcache && db->pcache->size < db->pclimit) { 212 | cdb_lock_lock(db->pclock); 213 | cdb_ht_insert2(db->pcache, &page->bid, SI4, page, MPAGESIZE(page)); 214 | cdb_lock_unlock(db->pclock); 215 | } 216 | } 217 | /* the page may not be still in stack */ 218 | if (page != (CDBPAGE *)sbuf) 219 | free(page); 220 | 221 | if (!loadbf && (db->pcache && db->pcache->size > db->pclimit)) 222 | break; 223 | } 224 | 225 | db->vio->pageitdestroy(db->vio, it); 226 | } 227 | 228 | 229 | /* generate an incremental global operation id */ 230 | uint64_t cdb_genoid(CDB *db) 231 | { 232 | uint64_t oid; 233 | cdb_lock_lock(db->oidlock); 234 | oid = db->oid++; 235 | cdb_lock_unlock(db->oidlock); 236 | return oid; 237 | } 238 | 239 | 240 | /* get a new record iterator */ 241 | void *cdb_iterate_new(CDB *db, uint64_t oid) 242 | { 243 | return db->vio->recitfirst(db->vio, oid); 244 | } 245 | 246 | 247 | 248 | /* iterate the database by callback */ 249 | uint64_t cdb_iterate(CDB *db, CDB_ITERCALLBACK itcb, void *arg, void *iter) 250 | { 251 | char sbuf[SBUFSIZE]; 252 | uint64_t cnt = 0; 253 | 254 | if (iter == NULL) 255 | return cnt; 256 | for(;;) { 257 | /* the rec is a copy from file, may in stack or allocated in heap */ 258 | CDBREC *rec = (CDBREC *)sbuf; 259 | bool ret = true; 260 | if (db->vio->recitnext(db->vio, &rec, iter) < 0) 261 | break; 262 | 263 | if (cdb_checkoff(db, CDBHASH64(rec->key, rec->ksize), rec->ooff, CDB_NOTLOCKED)) { 264 | ret = itcb(arg, rec->key, rec->ksize, rec->val, rec->vsize, rec->expire, rec->oid); 265 | cnt++; 266 | } 267 | if (rec != (CDBREC *)sbuf) 268 | free(rec); 269 | if (!ret) 270 | break; 271 | } 272 | return cnt; 273 | } 274 | 275 | 276 | 277 | /* destroy the iterator */ 278 | void 
cdb_iterate_destroy(CDB *db, void *iter) 279 | { 280 | db->vio->recitdestroy(db->vio, iter); 281 | } 282 | 283 | 284 | /* difficult to implement */ 285 | /* 286 | static void _cdb_rcachewarmup(CDB *db) 287 | { 288 | } 289 | */ 290 | 291 | 292 | CDB *cdb_new() 293 | { 294 | CDB *db; 295 | db = (CDB *)malloc(sizeof(CDB)); 296 | /* I assume all operation in this layer is 'fast', so no mutex used here */ 297 | for(int i = 0; i < MLOCKNUM; i++) 298 | db->mlock[i] = cdb_lock_new(CDB_LOCKSPIN); 299 | db->dpclock = cdb_lock_new(CDB_LOCKSPIN); 300 | db->pclock = cdb_lock_new(CDB_LOCKSPIN); 301 | db->rclock = cdb_lock_new(CDB_LOCKSPIN); 302 | db->stlock = cdb_lock_new(CDB_LOCKSPIN); 303 | db->oidlock = cdb_lock_new(CDB_LOCKSPIN); 304 | db->bflock = cdb_lock_new(CDB_LOCKSPIN); 305 | db->bgtask = cdb_bgtask_new(); 306 | /* every thread should has its own errno */ 307 | db->errkey = (pthread_key_t *)malloc(sizeof(pthread_key_t)); 308 | pthread_key_create(db->errkey, NULL); 309 | /* set default parameter */ 310 | _cdb_defparam(db); 311 | return db; 312 | } 313 | 314 | 315 | int cdb_option(CDB *db, int bnum, int rcacheMB, int pcacheMB) 316 | { 317 | /* too small bnum is not allowed */ 318 | db->hsize = bnum > 4096? 
bnum : 4096; 319 | 320 | if (rcacheMB >= 0) 321 | db->rclimit = (uint64_t)rcacheMB * MB; 322 | if (pcacheMB >= 0) 323 | db->pclimit = (uint64_t)pcacheMB * MB; 324 | return 0; 325 | } 326 | 327 | 328 | void cdb_option_bloomfilter(CDB *db, uint64_t size) 329 | { 330 | db->bfsize = size; 331 | } 332 | 333 | void cdb_option_areadsize(CDB *db, uint32_t size) 334 | { 335 | db->areadsize = size; 336 | if (db->areadsize < 1 * KB) 337 | db->areadsize = 1 * KB; 338 | 339 | if (db->areadsize > SBUFSIZE - (sizeof(CDBREC) - RECHSIZE)) 340 | db->areadsize = SBUFSIZE - (sizeof(CDBREC) - RECHSIZE); 341 | } 342 | 343 | int cdb_open(CDB *db, const char *file_name, int mode) 344 | { 345 | /* if will become into a hash table when file_name == CDB_MEMDB */ 346 | int memdb = (strcmp(file_name, CDB_MEMDB) == 0); 347 | 348 | if (db->rclimit) 349 | /* record cache is enabled */ 350 | db->rcache = cdb_ht_new(true, NULL); 351 | else if (memdb) { 352 | /* record cache is disabled, but in MEMDB mode */ 353 | cdb_seterrno(db, CDB_MEMDBNOCACHE, __FILE__, __LINE__); 354 | goto ERRRET; 355 | } 356 | 357 | if (db->pclimit && !memdb) { 358 | /* page cache enabled. 
page cache is meaningless under MEMDB mode */ 359 | db->dpcache = cdb_ht_new(true, _pagehash); 360 | db->pcache = cdb_ht_new(true, _pagehash); 361 | } 362 | 363 | 364 | if (!memdb) { 365 | if (db->bfsize) { 366 | /* bloom filter enabled */ 367 | db->bf = cdb_bf_new(db->bfsize, db->bfsize); 368 | } 369 | /* now only one storage format is supported */ 370 | db->vio = cdb_vio_new(CDBVIOAPND2); 371 | db->vio->db = db; 372 | if (db->vio->open(db->vio, file_name, mode) < 0) 373 | goto ERRRET; 374 | if (db->vio->rhead(db->vio) < 0) { 375 | db->mtable = (FOFF*)malloc(sizeof(FOFF) * db->hsize); 376 | memset(db->mtable, 0, sizeof(FOFF) * db->hsize); 377 | } 378 | /* dirty index page would be swap to disk by timer control */ 379 | cdb_bgtask_add(db->bgtask, _cdb_flushdpagetask, db, 1); 380 | db->ndpltime = time(NULL); 381 | /* start background task thread */ 382 | cdb_bgtask_start(db->bgtask); 383 | } else { 384 | /* no persistent storage under MEMDB mode */ 385 | db->vio = NULL; 386 | db->bgtask = NULL; 387 | db->mtable = NULL; 388 | } 389 | 390 | if (db->bf || ((mode & CDB_PAGEWARMUP) && db->pcache)) { 391 | /* fill the bloom filter if it is enabled, and fill the page cache */ 392 | _cdb_pagewarmup(db, !!db->bf); 393 | } 394 | 395 | /* reset the statistic info */ 396 | cdb_stat(db, NULL); 397 | db->opened = true; 398 | return 0; 399 | 400 | ERRRET: 401 | if (db->rcache) 402 | cdb_ht_destroy(db->rcache); 403 | if (db->pcache) 404 | cdb_ht_destroy(db->pcache); 405 | if (db->dpcache) 406 | cdb_ht_destroy(db->dpcache); 407 | if (db->bf) 408 | cdb_bf_destroy(db->bf); 409 | cdb_bgtask_stop(db->bgtask); 410 | _cdb_defparam(db); 411 | return -1; 412 | } 413 | 414 | 415 | /* check if the page cache size exceed the limit. 
clean oldest page if necessary */ 416 | static void _cdb_pageout(CDB *db) 417 | { 418 | while (PCOVERFLOW(db)) { 419 | if (db->pcache->num) { 420 | /* clean page cache is prior */ 421 | cdb_lock_lock(db->pclock); 422 | cdb_ht_removetail(db->pcache); 423 | cdb_lock_unlock(db->pclock); 424 | } else if (db->dpcache->num) { 425 | CDBHTITEM *item; 426 | uint32_t bid; 427 | FOFF off; 428 | cdb_lock_lock(db->dpclock); 429 | item = cdb_ht_gettail(db->dpcache); 430 | if (item == NULL) { 431 | cdb_lock_unlock(db->dpclock); 432 | break; 433 | } 434 | 435 | bid = *(uint32_t*)cdb_ht_itemkey(db->dpcache, item); 436 | /* must lock the main table inside the dpclock protection */ 437 | if (cdb_lock_trylock(db->mlock[bid % MLOCKNUM]) < 0) { 438 | /* avoid dead lock since dpclock is holding */ 439 | cdb_lock_unlock(db->dpclock); 440 | /* do nothing this time */ 441 | break; 442 | } 443 | cdb_ht_poptail(db->dpcache); 444 | cdb_lock_unlock(db->dpclock); 445 | 446 | /* write out dirty page */ 447 | struct timespec ts; 448 | _cdb_timerreset(&ts); 449 | db->vio->wpage(db->vio, (CDBPAGE*)cdb_ht_itemval(db->dpcache, item), &off); 450 | db->wcount++; 451 | db->wtime += _cdb_timermicrosec(&ts); 452 | db->mtable[bid] = off; 453 | cdb_lock_unlock(db->mlock[bid % MLOCKNUM]); 454 | free(item); 455 | } 456 | } 457 | } 458 | 459 | 460 | /* check if the record cache size exceed the limit. clean oldest record if necessary */ 461 | static void _cdb_recout(CDB *db) 462 | { 463 | while (RCOVERFLOW(db)) { 464 | cdb_lock_lock(db->rclock); 465 | if (db->rcache->num) 466 | cdb_ht_removetail(db->rcache); 467 | cdb_lock_unlock(db->rclock); 468 | } 469 | } 470 | 471 | 472 | /* get all offsets from index(page) by key, even if only one of them at most is valid. 
473 | Others are due to the hash collision */ 474 | int cdb_getoff(CDB *db, uint64_t hash, FOFF **offs, int locked) 475 | { 476 | char sbuf[SBUFSIZE]; 477 | CDBPAGE *page = NULL; 478 | int rnum; 479 | bool incache = true; 480 | uint32_t bid = (hash >> 24) % db->hsize; 481 | PHASH phash; 482 | 483 | phash.i1 = hash & 0xff; 484 | phash.i2 = (hash >> 8) & 0xffff; 485 | 486 | if (db->bf) { 487 | uint64_t bfkey = (bid << 24) | (hash & 0xffffff); 488 | /* check the key-hash in bloom filter? return now if not exist */ 489 | cdb_lock_lock(db->bflock); 490 | if (!cdb_bf_exist(db->bf, &bfkey, SI8)) { 491 | cdb_lock_unlock(db->bflock); 492 | return 0; 493 | } 494 | cdb_lock_unlock(db->bflock); 495 | } 496 | 497 | if (locked == CDB_NOTLOCKED) cdb_lock_lock(db->mlock[bid % MLOCKNUM]); 498 | /* page exists in clean page cache? */ 499 | if (db->pcache) { 500 | cdb_lock_lock(db->pclock); 501 | page = cdb_ht_get2(db->pcache, &bid, SI4, true); 502 | cdb_lock_unlock(db->pclock); 503 | } 504 | 505 | /* not in pcache, exists in dirty page cache? 
*/ 506 | if (page == NULL && db->dpcache) { 507 | cdb_lock_lock(db->dpclock); 508 | page = cdb_ht_get2(db->dpcache, &bid, SI4, true); 509 | cdb_lock_unlock(db->dpclock); 510 | } 511 | 512 | if (page == NULL) { 513 | /* not in dpcache either, read from disk */ 514 | incache = false; 515 | db->pcmiss++; 516 | /* page stays in stack by default */ 517 | page = (CDBPAGE *)sbuf; 518 | if (OFFNOTNULL(db->mtable[bid])) { 519 | /* page offset not null in main table */ 520 | int ret; 521 | struct timespec ts; 522 | _cdb_timerreset(&ts); 523 | ret = db->vio->rpage(db->vio, &page, db->mtable[bid]); 524 | db->rcount++; 525 | db->rtime += _cdb_timermicrosec(&ts); 526 | 527 | /* read page error, return */ 528 | if (ret < 0) { 529 | if (locked == CDB_NOTLOCKED) cdb_lock_unlock(db->mlock[bid % MLOCKNUM]); 530 | if (page != (CDBPAGE *)sbuf) 531 | free(page); 532 | return -1; 533 | } 534 | } else { 535 | /* no page in this bucket */ 536 | page->cap = page->num = 0; 537 | page->osize = 0; 538 | OFFZERO(page->ooff); 539 | } 540 | } else { 541 | db->pchit++; 542 | } 543 | 544 | rnum = 0; 545 | for(uint32_t i = 0; i < page->num; i++) { 546 | /* compare every hash in the page */ 547 | if (PHASHEQ(page->items[i].hash, phash)) { 548 | (*offs)[rnum] = page->items[i].off; 549 | /* result offset list stays in stack by default. 
Allocate one in heap if 550 | it exceeds the limit */ 551 | if (++rnum == SFOFFNUM) { 552 | /* very little possibility goes here */ 553 | FOFF *tmp = (FOFF*)malloc((page->num - i + SFOFFNUM + 1) * sizeof(FOFF)); 554 | memcpy(tmp, *offs, SFOFFNUM * sizeof(FOFF)); 555 | *offs = tmp; 556 | } 557 | } 558 | } 559 | 560 | if (!incache) { 561 | /* set into clean page cache if not exists before */ 562 | if (db->pcache) { 563 | cdb_lock_lock(db->pclock); 564 | cdb_ht_insert2(db->pcache, &bid, SI4, page, MPAGESIZE(page)); 565 | cdb_lock_unlock(db->pclock); 566 | } 567 | /* if page now points to heap memory, free it */ 568 | if (page != (CDBPAGE *)sbuf) { 569 | free(page); 570 | } 571 | } 572 | if (locked == CDB_NOTLOCKED) cdb_lock_unlock(db->mlock[bid % MLOCKNUM]); 573 | 574 | /* check page cache overflow */ 575 | if (PCOVERFLOW(db)) 576 | _cdb_pageout(db); 577 | 578 | return rnum; 579 | } 580 | 581 | 582 | /* replace a specified record's offset, may be used at disk space recycling 583 | off indicates its previous offset, noff is the new offset. 
return negative if not found */ 584 | int cdb_replaceoff(CDB *db, uint64_t hash, FOFF off, FOFF noff, int locked) 585 | { 586 | char sbuf[SBUFSIZE]; 587 | CDBPAGE *page = NULL; 588 | CDBHTITEM *pitem = NULL; 589 | bool indpcache = false; 590 | uint32_t bid = (hash >> 24) % db->hsize; 591 | PHASH phash; 592 | bool found = false; 593 | 594 | phash.i1 = hash & 0xff; 595 | phash.i2 = (hash >> 8) & 0xffff; 596 | 597 | if (locked == CDB_NOTLOCKED) cdb_lock_lock(db->mlock[bid % MLOCKNUM]); 598 | if (db->pcache) { 599 | /* in clean page cache, since it would be modified, it should be deleted from pcache */ 600 | cdb_lock_lock(db->pclock); 601 | pitem = cdb_ht_del(db->pcache, &bid, SI4); 602 | cdb_lock_unlock(db->pclock); 603 | if (pitem) 604 | page = (CDBPAGE *)cdb_ht_itemval(db->pcache, pitem); 605 | } 606 | if (page == NULL && db->dpcache) { 607 | /* not in pcache, but in dirty page cache */ 608 | cdb_lock_lock(db->dpclock); 609 | page = cdb_ht_get2(db->dpcache, &bid, SI4, true); 610 | cdb_lock_unlock(db->dpclock); 611 | if (page) 612 | indpcache = true; 613 | } 614 | if (page == NULL) { 615 | /* not exists either, read from disk */ 616 | db->pcmiss++; 617 | page = (CDBPAGE *)sbuf; 618 | if (OFFNOTNULL(db->mtable[bid])) { 619 | int ret; 620 | struct timespec ts; 621 | _cdb_timerreset(&ts); 622 | ret = db->vio->rpage(db->vio, &page, db->mtable[bid]); 623 | db->rcount++; 624 | db->rtime += _cdb_timermicrosec(&ts); 625 | 626 | if (ret < 0) { 627 | if (locked == CDB_NOTLOCKED) cdb_lock_unlock(db->mlock[bid % MLOCKNUM]); 628 | if (page != (CDBPAGE *)sbuf) 629 | free(page); 630 | return -1; 631 | } 632 | } else { 633 | /* nullified the empty page */ 634 | page->cap = page->num = 0; 635 | page->osize = 0; 636 | OFFZERO(page->ooff); 637 | } 638 | } else { 639 | db->pchit++; 640 | } 641 | 642 | /* check and modify */ 643 | for(uint32_t i = 0; i < page->num; i++) { 644 | if (PHASHEQ(page->items[i].hash, phash) 645 | && OFFEQ(page->items[i].off, off)) { 646 | page->items[i].off = 
noff; 647 | found = true; 648 | break; 649 | } 650 | } 651 | 652 | if (db->dpcache && !indpcache) { 653 | /* if page already dirty in cache, need not do anything */ 654 | /* dirty page cache is enabled but not exists before */ 655 | if (pitem) { 656 | /* pitem not NULL indicates it belongs to pcache */ 657 | if (found) { 658 | /* modified page */ 659 | cdb_lock_lock(db->dpclock); 660 | cdb_ht_insert(db->dpcache, pitem); 661 | cdb_lock_unlock(db->dpclock); 662 | } else { 663 | /* got from pcache, but not modified */ 664 | cdb_lock_lock(db->pclock); 665 | cdb_ht_insert(db->pcache, pitem); 666 | cdb_lock_unlock(db->pclock); 667 | } 668 | /* page belongs to memory in 'cache', must not free */ 669 | } else if (page != NULL) { 670 | /* page read from disk, but not in cache */ 671 | cdb_lock_lock(db->dpclock); 672 | cdb_ht_insert2(db->dpcache, &bid, SI4, page, MPAGESIZE(page)); 673 | cdb_lock_unlock(db->dpclock); 674 | /* the 'page' won't be use anymore */ 675 | if (page != (CDBPAGE *)sbuf) 676 | free(page); 677 | } 678 | } else if (!db->dpcache){ 679 | /* no page cache. 
Write out dirty page immediately */ 680 | FOFF poff; 681 | struct timespec ts; 682 | _cdb_timerreset(&ts); 683 | db->vio->wpage(db->vio, page, &poff); 684 | db->wcount++; 685 | db->wtime += _cdb_timermicrosec(&ts); 686 | 687 | db->mtable[bid] = poff; 688 | if (page != (CDBPAGE *)sbuf) 689 | free(page); 690 | } 691 | if (locked == CDB_NOTLOCKED) cdb_lock_unlock(db->mlock[bid % MLOCKNUM]); 692 | 693 | /* check page cache overflow */ 694 | if (PCOVERFLOW(db)) 695 | _cdb_pageout(db); 696 | 697 | return 0; 698 | } 699 | 700 | 701 | /* insert/delete a key-offset pair from index page */ 702 | int cdb_updatepage(CDB *db, uint64_t hash, FOFF off, int opt, int locked) 703 | { 704 | char sbuf[SBUFSIZE], sbuf2[SBUFSIZE]; 705 | CDBPAGE *page = NULL, *npage = NULL; 706 | CDBHTITEM *pitem = NULL, *nitem = NULL; 707 | CDBHASHTABLE *tmpcache = NULL; 708 | CDBLOCK *tmpclock = NULL; 709 | int npsize = 0; 710 | uint32_t bid = (hash >> 24) % db->hsize; 711 | PHASH phash; 712 | 713 | phash.i1 = hash & 0xff; 714 | phash.i2 = (hash >> 8) & 0xffff; 715 | 716 | if (locked == CDB_NOTLOCKED) cdb_lock_lock(db->mlock[bid % MLOCKNUM]); 717 | /* firstly, try move the page out of the cache if possible, 718 | it assumes that the page would be modified(pair exists) */ 719 | if (db->pcache) { 720 | /* try clean page cache */ 721 | cdb_lock_lock(db->pclock); 722 | pitem = cdb_ht_del(db->pcache, &bid, SI4); 723 | cdb_lock_unlock(db->pclock); 724 | if (pitem) { 725 | page = (CDBPAGE *)cdb_ht_itemval(db->pcache, pitem); 726 | tmpcache = db->pcache; 727 | tmpclock = db->pclock; 728 | } 729 | } 730 | if (page == NULL && db->dpcache) { 731 | /* try dirty page cache */ 732 | cdb_lock_lock(db->dpclock); 733 | pitem = cdb_ht_del(db->dpcache, &bid, SI4); 734 | cdb_lock_unlock(db->dpclock); 735 | if (pitem) { 736 | page = (CDBPAGE *)cdb_ht_itemval(db->dpcache, pitem); 737 | tmpcache = db->dpcache; 738 | tmpclock = db->dpclock; 739 | } 740 | } 741 | 742 | if (page == NULL) { 743 | db->pcmiss++; 744 | page = 
(CDBPAGE *)sbuf; 745 | /* doesn't exist in cache, read from disk */ 746 | if (OFFNOTNULL(db->mtable[bid])) { 747 | int ret; 748 | struct timespec ts; 749 | _cdb_timerreset(&ts); 750 | ret = db->vio->rpage(db->vio, &page, db->mtable[bid]); 751 | db->rcount++; 752 | db->rtime += _cdb_timermicrosec(&ts); 753 | 754 | if (ret < 0) { 755 | if (locked == CDB_NOTLOCKED) cdb_lock_unlock(db->mlock[bid % MLOCKNUM]); 756 | if (page != (CDBPAGE *)sbuf) 757 | free(page); 758 | return -1; 759 | } 760 | } else { 761 | page->cap = 0; 762 | page->num = 0; 763 | page->osize = 0; 764 | OFFZERO(page->ooff); 765 | } 766 | } else { 767 | db->pchit++; 768 | } 769 | 770 | npsize = MPAGESIZE(page); 771 | 772 | if (opt == CDB_PAGEDELETEOFF) 773 | ;// npsize = MPAGESIZE(page) - sizeof(PITEM); 774 | /* do not malloc new page on deletion */ 775 | 776 | else if (opt == CDB_PAGEINSERTOFF && page->cap == page->num) { 777 | /* get a new page, from dirty page cache if possible */ 778 | npsize = MPAGESIZE(page) + CDB_PAGEINCR * sizeof(PITEM); 779 | if (db->dpcache) { 780 | nitem = cdb_ht_newitem(db->dpcache, SI4, npsize); 781 | *(uint32_t*)cdb_ht_itemkey(db->dpcache, nitem) = bid; 782 | npage = (CDBPAGE *)cdb_ht_itemval(db->dpcache, nitem); 783 | } else { 784 | /* no dpcache, use stack if size fits */ 785 | if (npsize > SBUFSIZE) 786 | npage = (CDBPAGE *)malloc(npsize); 787 | else 788 | npage = (CDBPAGE *)sbuf2; 789 | } 790 | 791 | /* initialize the new page */ 792 | 793 | npage->bid = bid; 794 | npage->oid = cdb_genoid(db); 795 | npage->osize = page->osize; 796 | npage->ooff = page->ooff; 797 | npage->mtime = time(NULL); 798 | npage->cap = page->cap + CDB_PAGEINCR; 799 | npage->num = page->num; 800 | memcpy(npage->items, page->items, page->num * sizeof(PITEM)); 801 | /* old page got from cache */ 802 | if (pitem) 803 | free(pitem); 804 | /* old page read from disk, if in stack? 
*/ 805 | else if (page != (CDBPAGE *)sbuf) 806 | free(page); 807 | 808 | page = npage; 809 | pitem = nitem; 810 | } 811 | 812 | uint32_t onum = page->num; 813 | 814 | if (opt == CDB_PAGEDELETEOFF) { 815 | bool found = false; 816 | for(uint32_t i = 0; i < page->num; i++) { 817 | if (!found) { 818 | if (PHASHEQ(page->items[i].hash, phash) 819 | && OFFEQ(page->items[i].off, off)) 820 | { 821 | found = true; 822 | /* records num is consistant with index */ 823 | cdb_lock_lock(db->stlock); 824 | db->rnum--; 825 | cdb_lock_unlock(db->stlock); 826 | } 827 | } 828 | if (found && i + 1 < page->num) 829 | page->items[i] = page->items[i+1]; 830 | } 831 | if (found) 832 | page->num--; 833 | } else if (opt == CDB_PAGEINSERTOFF) { 834 | bool found = false; 835 | /* check already exist? */ 836 | for(uint32_t i = 0; i < page->num; i++) { 837 | if (PHASHEQ(page->items[i].hash, phash) 838 | && OFFEQ(page->items[i].off, off)) { 839 | /* avoid exceptional deduplicated item */ 840 | found = true; 841 | break; 842 | } 843 | } 844 | 845 | /* append to the tail */ 846 | if (!found) { 847 | page->items[page->num].hash = phash; 848 | page->items[page->num].off = off; 849 | page->num++; 850 | /* records num is consistant with index */ 851 | cdb_lock_lock(db->stlock); 852 | db->rnum++; 853 | cdb_lock_unlock(db->stlock); 854 | if (db->bf) { 855 | uint64_t bfkey = (((hash >> 24) % db->hsize) << 24) | (hash & 0xffffff); 856 | cdb_lock_lock(db->bflock); 857 | cdb_bf_set(db->bf, &bfkey, SI8); 858 | cdb_lock_unlock(db->bflock); 859 | } 860 | } 861 | } 862 | 863 | if (page->num == onum) { 864 | /* nothing done */ 865 | if (pitem) { 866 | /* insert the item back to the cache where it belongs */ 867 | cdb_lock_lock(tmpclock); 868 | cdb_ht_insert(tmpcache, pitem); 869 | cdb_lock_unlock(tmpclock); 870 | } else { 871 | if (page != (CDBPAGE *)sbuf2 872 | && page != (CDBPAGE *)sbuf) 873 | free(page); 874 | } 875 | if (locked == CDB_NOTLOCKED) cdb_lock_unlock(db->mlock[bid % MLOCKNUM]); 876 | return -1; 877 
| } else { 878 | if (pitem) { 879 | cdb_lock_lock(db->dpclock); 880 | cdb_ht_insert(db->dpcache, pitem); 881 | cdb_lock_unlock(db->dpclock); 882 | } else { 883 | struct timespec ts; 884 | _cdb_timerreset(&ts); 885 | db->vio->wpage(db->vio, page, &off); 886 | db->wcount++; 887 | db->wtime += _cdb_timermicrosec(&ts); 888 | 889 | db->mtable[bid] = off; 890 | if (page != (CDBPAGE *)sbuf2 891 | && page != (CDBPAGE *)sbuf) 892 | free(page); 893 | } 894 | } 895 | 896 | if (locked == CDB_NOTLOCKED) cdb_lock_unlock(db->mlock[bid % MLOCKNUM]); 897 | 898 | /* check page cache overflow */ 899 | if (PCOVERFLOW(db)) 900 | _cdb_pageout(db); 901 | 902 | return 0; 903 | } 904 | 905 | 906 | /* check if an record with specified key-offset exists in index */ 907 | bool cdb_checkoff(CDB *db, uint64_t hash, FOFF off, int locked) 908 | { 909 | FOFF soffs[SFOFFNUM]; 910 | FOFF *soff = (FOFF *)soffs; 911 | int dupnum; 912 | int ret = false; 913 | 914 | /* get all possible offsets */ 915 | dupnum = cdb_getoff(db, hash, &soff, locked); 916 | for(int i = 0; i < dupnum; i++) { 917 | if (OFFEQ(soff[i], off)) { 918 | ret = true; 919 | break; 920 | } 921 | } 922 | 923 | if (soff != (FOFF *)soffs) { 924 | free(soff); 925 | } 926 | 927 | return ret; 928 | } 929 | 930 | 931 | /* wrapper and simplified of set operation */ 932 | int cdb_set(CDB *db, const char *key, int ksize, const char *val, int vsize) 933 | { 934 | return cdb_set2(db, key, ksize, val, vsize, CDB_OVERWRITE, 0); 935 | } 936 | 937 | 938 | int cdb_set2(CDB *db, const char *key, int ksize, const char *val, int vsize, int opt, int expire) 939 | { 940 | CDBREC rec; 941 | FOFF ooff, noff; 942 | uint32_t now = time(NULL); 943 | uint64_t hash; 944 | uint32_t lockid; 945 | bool expired = false; 946 | 947 | if (db->vio == NULL) { 948 | /* if it is a memdb, just operate on the record cache and return */ 949 | cdb_lock_lock(db->rclock); 950 | cdb_ht_insert2(db->rcache, key, ksize, val, vsize); 951 | cdb_lock_unlock(db->rclock); 952 | if 
(RCOVERFLOW(db)) 953 | _cdb_recout(db); 954 | return 0; 955 | } 956 | 957 | hash = CDBHASH64(key, ksize); 958 | lockid = (hash >> 24) % db->hsize % MLOCKNUM; 959 | OFFZERO(rec.ooff); 960 | OFFZERO(ooff); 961 | rec.osize = 0; 962 | rec.key = (char*)key; 963 | rec.val = (char*)val; 964 | rec.ksize = ksize; 965 | rec.vsize = vsize; 966 | rec.oid = cdb_genoid(db); 967 | rec.expire = expire? now + expire : 0; 968 | 969 | cdb_lock_lock(db->mlock[lockid]); 970 | if (db->rcache) { 971 | /* if record already exists, get its old meta info */ 972 | int item_vsize; 973 | char *cval; 974 | uint32_t old_expire = 0; 975 | cdb_lock_lock(db->rclock); 976 | cval = cdb_ht_get(db->rcache, key, ksize, &item_vsize, false); 977 | if (cval) { 978 | /* record already exists */ 979 | ooff = rec.ooff = *(FOFF*)cval; 980 | rec.osize = item_vsize - SFOFF - SI4; 981 | old_expire = *(uint32_t*)(cval + SFOFF); 982 | } 983 | cdb_lock_unlock(db->rclock); 984 | if (old_expire && old_expire <= now) 985 | /* once exist but expired? 
*/ 986 | expired = true; 987 | } 988 | 989 | if (OFFNULL(ooff)) { 990 | FOFF soffs[SFOFFNUM]; 991 | FOFF *soff = soffs; 992 | char sbuf[SBUFSIZE]; 993 | CDBREC *rrec = (CDBREC*)sbuf; 994 | 995 | int retnum; 996 | if ((retnum = cdb_getoff(db, hash, &soff, CDB_LOCKED)) < 0) { 997 | cdb_lock_unlock(db->mlock[lockid]); 998 | return -1; 999 | } 1000 | 1001 | for(int i = 0; i < retnum; i++) { 1002 | /* check for duplicate records/older version*/ 1003 | int cret; 1004 | if (rrec != (CDBREC*)sbuf) { 1005 | free(rrec); 1006 | rrec = (CDBREC*)sbuf; 1007 | } 1008 | 1009 | struct timespec ts; 1010 | _cdb_timerreset(&ts); 1011 | cret = db->vio->rrec(db->vio, &rrec, soff[i], false); 1012 | db->rcount++; 1013 | db->rtime += _cdb_timermicrosec(&ts); 1014 | 1015 | if (cret < 0) 1016 | continue; 1017 | 1018 | if (ksize == rrec->ksize && memcmp(rrec->key, key, ksize) == 0) { 1019 | /* got its old meta info */ 1020 | rec.osize = rrec->osize; 1021 | rec.ooff = rrec->ooff; 1022 | ooff = rec.ooff; 1023 | if (rrec->expire <= now) 1024 | expired = true; 1025 | break; 1026 | } 1027 | } 1028 | if (soff != soffs) 1029 | free(soff); 1030 | if (rrec != (CDBREC*)sbuf) 1031 | free(rrec); 1032 | } 1033 | 1034 | if (OFFNOTNULL(ooff) && !expired) { 1035 | /* record already exists*/ 1036 | if (opt & CDB_INSERTIFNOEXIST) { 1037 | cdb_lock_unlock(db->mlock[lockid]); 1038 | cdb_seterrno(db, CDB_EXIST, __FILE__, __LINE__); 1039 | return -2; 1040 | } 1041 | } else { 1042 | if (opt & CDB_INSERTIFEXIST) { 1043 | cdb_lock_unlock(db->mlock[lockid]); 1044 | cdb_seterrno(db, CDB_NOTFOUND, __FILE__, __LINE__); 1045 | return -3; 1046 | } 1047 | } 1048 | 1049 | struct timespec ts; 1050 | _cdb_timerreset(&ts); 1051 | if (db->vio->wrec(db->vio, &rec, &noff) < 0) { 1052 | cdb_lock_unlock(db->mlock[lockid]); 1053 | return -1; 1054 | } 1055 | db->wcount++; 1056 | db->wtime += _cdb_timermicrosec(&ts); 1057 | 1058 | if (OFFNOTNULL(ooff)) { 1059 | cdb_replaceoff(db, hash, ooff, noff, CDB_LOCKED); 1060 | } else { 1061 | 
cdb_updatepage(db, hash, noff, CDB_PAGEINSERTOFF, CDB_LOCKED); 1062 | } 1063 | 1064 | if (db->rcache) { 1065 | if ((opt & CDB_INSERTCACHE) == CDB_INSERTCACHE) { 1066 | char *cval; 1067 | CDBHTITEM *item = cdb_ht_newitem(db->rcache, ksize, vsize + SI4 + SFOFF); 1068 | memcpy(cdb_ht_itemkey(db->rcache, item), key, ksize); 1069 | cval = cdb_ht_itemval(db->rcache, item); 1070 | memcpy(cval + SI4 + SFOFF, val, vsize); 1071 | *(FOFF*)(cval) = rec.ooff; 1072 | *(uint32_t*)(cval + SFOFF) = rec.expire; 1073 | cdb_lock_lock(db->rclock); 1074 | cdb_ht_insert(db->rcache, item); 1075 | cdb_lock_unlock(db->rclock); 1076 | } 1077 | } 1078 | cdb_lock_unlock(db->mlock[lockid]); 1079 | 1080 | if (RCOVERFLOW(db)) 1081 | _cdb_recout(db); 1082 | 1083 | cdb_seterrno(db, CDB_SUCCESS, __FILE__, __LINE__); 1084 | return 0; 1085 | } 1086 | 1087 | 1088 | 1089 | int cdb_get(CDB *db, const char *key, int ksize, void **val, int *vsize) 1090 | { 1091 | char sbuf[SBUFSIZE]; 1092 | CDBREC *rec = (CDBREC *)sbuf; 1093 | FOFF soffs[SFOFFNUM]; 1094 | FOFF *offs; 1095 | int dupnum, ret = -3; 1096 | uint64_t hash; 1097 | uint32_t now = time(NULL); 1098 | uint32_t lockid; 1099 | 1100 | *vsize = 0; 1101 | *val = NULL; 1102 | if (db->rcache) { 1103 | char *cval; 1104 | cdb_lock_lock(db->rclock); 1105 | cval = cdb_ht_get(db->rcache, key, ksize, vsize, true); 1106 | if (cval) { 1107 | db->rchit++; 1108 | if (db->vio) { 1109 | (*vsize) -= SI4 + SFOFF; 1110 | if (*(uint32_t*)(cval + SFOFF) 1111 | && *(uint32_t*)(cval + SFOFF) <= now) { 1112 | cdb_lock_unlock(db->rclock); 1113 | /* not found no not report error now */ 1114 | //cdb_seterrno(db, CDB_NOTFOUND, __FILE__, __LINE__); 1115 | return -3; 1116 | } 1117 | cval = (void*)(cval + SI4 + SFOFF); 1118 | } 1119 | *val = malloc(*vsize); 1120 | memcpy(*val, cval, *vsize); 1121 | cdb_lock_unlock(db->rclock); 1122 | return 0; 1123 | } else { 1124 | db->rcmiss++; 1125 | if (db->vio == NULL) { 1126 | cdb_lock_unlock(db->rclock); 1127 | return -3; 1128 | } 1129 | } 
1130 | cdb_lock_unlock(db->rclock); 1131 | } 1132 | 1133 | offs = soffs; 1134 | hash = CDBHASH64(key, ksize); 1135 | lockid = (hash >> 24) % db->hsize % MLOCKNUM; 1136 | cdb_lock_lock(db->mlock[lockid]); 1137 | dupnum = cdb_getoff(db, hash, &offs, CDB_LOCKED); 1138 | if (dupnum < 0) { 1139 | cdb_lock_unlock(db->mlock[lockid]); 1140 | return -1; 1141 | } 1142 | 1143 | for(int i = 0; i < dupnum; i++) { 1144 | int cret; 1145 | if (rec != (CDBREC*)sbuf) { 1146 | free(rec); 1147 | rec = (CDBREC*)sbuf; 1148 | } 1149 | 1150 | struct timespec ts; 1151 | _cdb_timerreset(&ts); 1152 | cret = db->vio->rrec(db->vio, &rec, offs[i], true); 1153 | db->rcount++; 1154 | db->rtime += _cdb_timermicrosec(&ts); 1155 | 1156 | if (cret < 0) 1157 | continue; 1158 | 1159 | if (ksize == rec->ksize && memcmp(rec->key, key, ksize) == 0) { 1160 | if (rec->expire && rec->expire <= now) { 1161 | break; 1162 | } 1163 | *vsize = rec->vsize; 1164 | *val = malloc(*vsize); 1165 | memcpy(*val, rec->val, *vsize); 1166 | ret = 0; 1167 | break; 1168 | } 1169 | } 1170 | 1171 | if (ret == 0 && db->rcache) { 1172 | char *cval; 1173 | CDBHTITEM *item = cdb_ht_newitem(db->rcache, ksize, *vsize + SI4 + SFOFF); 1174 | memcpy(cdb_ht_itemkey(db->rcache, item), key, ksize); 1175 | cval = cdb_ht_itemval(db->rcache, item); 1176 | memcpy(cval + SI4 + SFOFF, *val, *vsize); 1177 | *(FOFF*)(cval) = rec->ooff; 1178 | *(uint32_t*)(cval + SFOFF) = rec->expire; 1179 | cdb_lock_lock(db->rclock); 1180 | cdb_ht_insert(db->rcache, item); 1181 | cdb_lock_unlock(db->rclock); 1182 | } 1183 | cdb_lock_unlock(db->mlock[lockid]); 1184 | 1185 | if (RCOVERFLOW(db)) 1186 | _cdb_recout(db); 1187 | 1188 | if (offs != soffs) 1189 | free(offs); 1190 | 1191 | if (rec != (CDBREC*)sbuf) 1192 | free(rec); 1193 | 1194 | if (ret < 0) 1195 | cdb_seterrno(db, CDB_NOTFOUND, __FILE__, __LINE__); 1196 | else { 1197 | db->rcmiss++; 1198 | cdb_seterrno(db, CDB_SUCCESS, __FILE__, __LINE__); 1199 | } 1200 | return ret; 1201 | } 1202 | 1203 | 1204 | void 
cdb_free_val(void **val) 1205 | { 1206 | if (*val) 1207 | free(*val); 1208 | *val = NULL; 1209 | } 1210 | 1211 | 1212 | int cdb_del(CDB *db, const char *key, int ksize) 1213 | { 1214 | FOFF ooff; 1215 | CDBREC rec; 1216 | uint32_t lockid; 1217 | uint64_t hash; 1218 | 1219 | OFFZERO(rec.ooff); 1220 | OFFZERO(ooff); 1221 | rec.osize = 0; 1222 | rec.key = (char*)key; 1223 | rec.ksize = ksize; 1224 | rec.val = NULL; 1225 | rec.vsize = 0; 1226 | 1227 | if (db->vio == NULL) { 1228 | /* if it is a memdb, just operate on the record cache and return */ 1229 | cdb_lock_lock(db->rclock); 1230 | cdb_ht_del2(db->rcache, key, ksize); 1231 | cdb_lock_unlock(db->rclock); 1232 | if (RCOVERFLOW(db)) 1233 | _cdb_recout(db); 1234 | return 0; 1235 | } 1236 | 1237 | hash = CDBHASH64(key, ksize); 1238 | lockid = (hash >> 24) % db->hsize % MLOCKNUM; 1239 | cdb_lock_lock(db->mlock[lockid]); 1240 | if (db->rcache) { 1241 | /* if record already exists, get its old meta info */ 1242 | CDBHTITEM *item; 1243 | cdb_lock_lock(db->rclock); 1244 | item = cdb_ht_del(db->rcache, key, ksize); 1245 | cdb_lock_unlock(db->rclock); 1246 | if (item) { 1247 | char *cval = cdb_ht_itemval(db->rcache, item); 1248 | ooff = rec.ooff = *(FOFF*)cval; 1249 | rec.osize = item->vsize - SFOFF - SI4; 1250 | rec.expire = *(uint32_t*)(cval + SFOFF); 1251 | free(item); 1252 | } 1253 | } 1254 | 1255 | if (OFFNULL(ooff)) { 1256 | FOFF soffs[SFOFFNUM]; 1257 | FOFF *soff = soffs; 1258 | char sbuf[SBUFSIZE]; 1259 | CDBREC *rrec = (CDBREC*)sbuf; 1260 | 1261 | int retnum; 1262 | if ((retnum = cdb_getoff(db, hash, &soff, CDB_LOCKED)) < 0) { 1263 | cdb_lock_unlock(db->mlock[lockid]); 1264 | return -1; 1265 | } 1266 | 1267 | for(int i = 0; i < retnum; i++) { 1268 | /* check for duplicate records/older version*/ 1269 | int cret; 1270 | if (rrec != (CDBREC*)sbuf) { 1271 | free(rrec); 1272 | rrec = (CDBREC*)sbuf; 1273 | } 1274 | 1275 | struct timespec ts; 1276 | _cdb_timerreset(&ts); 1277 | cret = db->vio->rrec(db->vio, &rrec, 
soff[i], false); 1278 | db->rcount++; 1279 | db->rtime += _cdb_timermicrosec(&ts); 1280 | 1281 | if (cret < 0) 1282 | continue; 1283 | 1284 | if (ksize == rrec->ksize && memcmp(rrec->key, key, ksize) == 0) { 1285 | /* got its old meta info */ 1286 | rec.osize = rrec->osize; 1287 | rec.ooff = rrec->ooff; 1288 | ooff = rec.ooff; 1289 | break; 1290 | } 1291 | } 1292 | if (soff != soffs) 1293 | free(soff); 1294 | if (rrec != (CDBREC*)sbuf) 1295 | free(rrec); 1296 | } 1297 | 1298 | if (OFFNOTNULL(ooff)) { 1299 | cdb_updatepage(db, hash, ooff, CDB_PAGEDELETEOFF, CDB_LOCKED); 1300 | cdb_lock_unlock(db->mlock[lockid]); 1301 | 1302 | struct timespec ts; 1303 | _cdb_timerreset(&ts); 1304 | if (db->vio->drec(db->vio, &rec, ooff) < 0) 1305 | ; // return -1; succeed or not doesn't matter 1306 | db->wcount++; 1307 | db->wtime += _cdb_timermicrosec(&ts); 1308 | cdb_seterrno(db, CDB_SUCCESS, __FILE__, __LINE__); 1309 | return 0; 1310 | } else { 1311 | cdb_lock_unlock(db->mlock[lockid]); 1312 | cdb_seterrno(db, CDB_NOTFOUND, __FILE__, __LINE__); 1313 | return -3; 1314 | } 1315 | } 1316 | 1317 | 1318 | void cdb_stat(CDB *db, CDBSTAT *stat) 1319 | { 1320 | if (stat == NULL) { 1321 | db->rchit = db->rcmiss = 0; 1322 | db->pchit = db->pcmiss = 0; 1323 | db->rcount = db->rtime = 0; 1324 | db->wcount = db->wtime = 0; 1325 | } else { 1326 | stat->rnum = db->rnum; 1327 | stat->rcnum = db->rcache? db->rcache->num : 0; 1328 | stat->pnum = db->hsize; 1329 | stat->pcnum = (db->pcache? db->pcache->num : 0) 1330 | + (db->dpcache? db->dpcache->num : 0); 1331 | stat->rchit = db->rchit; 1332 | stat->rcmiss = db->rcmiss; 1333 | stat->pchit = db->pchit; 1334 | stat->pcmiss = db->pcmiss; 1335 | stat->rlatcy = db->rcount ? db->rtime / db->rcount : 0; 1336 | stat->wlatcy = db->wcount ? 
db->wtime / db->wcount : 0; 1337 | } 1338 | } 1339 | 1340 | 1341 | int cdb_close(CDB *db) 1342 | { 1343 | if (!db->opened) 1344 | return -1; 1345 | 1346 | if (db->bgtask) 1347 | cdb_bgtask_stop(db->bgtask); 1348 | if (db->rcache) 1349 | cdb_ht_destroy(db->rcache); 1350 | if (db->pcache) 1351 | cdb_ht_destroy(db->pcache); 1352 | if (db->dpcache) { 1353 | cdb_flushalldpage(db); 1354 | cdb_ht_destroy(db->dpcache); 1355 | } 1356 | 1357 | if (db->vio) { 1358 | db->vio->whead(db->vio); 1359 | db->vio->close(db->vio); 1360 | cdb_vio_destroy(db->vio); 1361 | } 1362 | if (db->mtable) 1363 | free(db->mtable); 1364 | db->opened = false; 1365 | _cdb_defparam(db); 1366 | return 0; 1367 | } 1368 | 1369 | 1370 | void cdb_deferrorcb(void *arg, int errno, const char *file, int line) 1371 | { 1372 | fprintf(stderr, "DBERR: [%s:%d] %d - %s\n", file, line, errno, cdb_errmsg(errno)); 1373 | } 1374 | 1375 | 1376 | int cdb_destroy(CDB *db) 1377 | { 1378 | if (db->opened) 1379 | cdb_close(db); 1380 | for(int i = 0; i < MLOCKNUM; i++) 1381 | cdb_lock_destory(db->mlock[i]); 1382 | cdb_lock_destory(db->dpclock); 1383 | cdb_lock_destory(db->pclock); 1384 | cdb_lock_destory(db->rclock); 1385 | cdb_lock_destory(db->stlock); 1386 | cdb_lock_destory(db->oidlock); 1387 | cdb_lock_destory(db->bflock); 1388 | cdb_bgtask_destroy(db->bgtask); 1389 | pthread_key_delete(*(pthread_key_t*)db->errkey); 1390 | free(db->errkey); 1391 | free(db); 1392 | return 0; 1393 | } 1394 | 1395 | 1396 | 1397 | --------------------------------------------------------------------------------