├── NEWS ├── ChangeLog ├── README ├── AUTHORS ├── .gitignore ├── doc ├── fast15_destor.pdf ├── atc14-paper-fu_min.pdf └── README.md ├── src ├── recipe │ ├── Makefile.am │ └── recipestore.h ├── fsl │ ├── Makefile.am │ └── read_fsl_trace.c ├── chunking │ ├── Makefile.am │ ├── chunking.h │ └── ae_chunking.c ├── storage │ ├── Makefile.am │ └── containerstore.h ├── utils │ ├── Makefile.am │ ├── bloom_filter.h │ ├── sync_queue.h │ ├── queue.h │ ├── lru_cache.h │ ├── queue.c │ ├── lru_cache.c │ ├── sync_queue.c │ ├── sds.h │ ├── serial.c │ ├── serial.h │ └── bloom_filter.c ├── cma.h ├── index │ ├── Makefile.am │ ├── kvstore.h │ ├── fingerprint_cache.h │ ├── index_buffer.h │ ├── kvstore.c │ ├── index.h │ ├── fingerprint_cache.c │ ├── segmenting_method.c │ ├── kvstore_htable.c │ └── similarity_detection.c ├── Makefile.am ├── restore.h ├── hash_phase.c ├── rewrite_phase.h ├── jcr.h ├── backup.h ├── jcr.c ├── restore_aware.c ├── cfl_rewrite.c ├── read_phase.c ├── do_delete.c ├── cap_rewrite.c ├── dedup_phase.c ├── rewrite_phase.c ├── cma.c ├── trace_phase.c ├── cbr_rewrite.c ├── assembly_restore.c ├── har_rewrite.c ├── chunk_phase.c ├── do_backup.c ├── optimal_restore.c └── do_restore.c ├── rebuild ├── Makefile.am ├── scripts ├── ndpl.sh ├── edpl.sh ├── edll.sh ├── cost.sh ├── ndll_spv.sh ├── restore.sh ├── hybrid.sh ├── rewrite.sh ├── benchmark.sh ├── caf.sh ├── ndll.sh ├── cache.sh ├── interplay.sh └── destor.config ├── configure.ac ├── destor.config └── README.md /NEWS: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | See README.md 2 | 
-------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Author : Min Fu 2 | Email : fumin@hust.edu.cn 3 | Blog : fomy.snaapp.com 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.out 3 | *.files 4 | *.a 5 | Makefile 6 | .deps 7 | tags 8 | destor 9 | -------------------------------------------------------------------------------- /doc/fast15_destor.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fomy/destor/HEAD/doc/fast15_destor.pdf -------------------------------------------------------------------------------- /src/recipe/Makefile.am: -------------------------------------------------------------------------------- 1 | noinst_LIBRARIES=librecipe.a 2 | librecipe_a_SOURCES=recipestore.c 3 | -------------------------------------------------------------------------------- /doc/atc14-paper-fu_min.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fomy/destor/HEAD/doc/atc14-paper-fu_min.pdf -------------------------------------------------------------------------------- /src/fsl/Makefile.am: -------------------------------------------------------------------------------- 1 | noinst_LIBRARIES=libfsl.a 2 | libfsl_a_SOURCES=libhashfile.c read_fsl_trace.c 3 | -------------------------------------------------------------------------------- /src/chunking/Makefile.am: -------------------------------------------------------------------------------- 1 | noinst_LIBRARIES=libchunk.a 2 | libchunk_a_SOURCES=rabin_chunking.c ae_chunking.c 3 | -------------------------------------------------------------------------------- /src/storage/Makefile.am: 
-------------------------------------------------------------------------------- 1 | noinst_LIBRARIES=libstorage.a 2 | libstorage_a_SOURCES=containerstore.c 3 | LIBS=-lglib 4 | -------------------------------------------------------------------------------- /src/utils/Makefile.am: -------------------------------------------------------------------------------- 1 | noinst_LIBRARIES=libutils.a 2 | libutils_a_SOURCES=lru_cache.c sync_queue.c queue.c serial.c bloom_filter.c sds.c 3 | -------------------------------------------------------------------------------- /src/cma.h: -------------------------------------------------------------------------------- 1 | #ifndef CMA_H_ 2 | #define CMA_H_ 3 | 4 | #include "destor.h" 5 | 6 | void update_manifest(GHashTable *monitor); 7 | GHashTable* trunc_manifest(int jobid); 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /src/index/Makefile.am: -------------------------------------------------------------------------------- 1 | noinst_LIBRARIES=libindex.a 2 | libindex_a_SOURCES=index.c fingerprint_cache.c kvstore.c kvstore_htable.c sampling_method.c segmenting_method.c similarity_detection.c 3 | LIBS=-lglib 4 | -------------------------------------------------------------------------------- /rebuild: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | DIR=/home/data/working/ 4 | mkdir -p ${DIR}/recipes/ 5 | mkdir -p ${DIR}/index/ 6 | 7 | rm -f ${DIR}/recipes/* 8 | rm -f ${DIR}/index/* 9 | rm -f ${DIR}/container.pool 10 | rm -f ${DIR}/destor.stat 11 | rm -f ${DIR}/manifest 12 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | bin_PROGRAMS=destor 2 | destor_SOURCES= 3 | destor_LDADD=src/libdestor.a src/index/libindex.a src/storage/libstorage.a src/utils/libutils.a src/recipe/librecipe.a 
src/chunking/libchunk.a src/fsl/libfsl.a 4 | SUBDIRS=src src/index src/recipe src/storage src/utils src/chunking src/fsl 5 | LIBS=-lpthread -lcrypto -lglib 6 | -------------------------------------------------------------------------------- /src/Makefile.am: -------------------------------------------------------------------------------- 1 | noinst_LIBRARIES=libdestor.a 2 | libdestor_a_SOURCES=destor.c jcr.c config.c do_backup.c read_phase.c chunk_phase.c hash_phase.c trace_phase.c dedup_phase.c rewrite_phase.c filter_phase.c cfl_rewrite.c cap_rewrite.c cbr_rewrite.c har_rewrite.c restore_aware.c do_restore.c optimal_restore.c assembly_restore.c cma.c do_delete.c 3 | LIBS=-lglib 4 | -------------------------------------------------------------------------------- /src/index/kvstore.h: -------------------------------------------------------------------------------- 1 | #ifndef KVSTORE_H_ 2 | #define KVSTORE_H_ 3 | 4 | #include "../destor.h" 5 | 6 | void init_kvstore(); 7 | 8 | extern void (*close_kvstore)(); 9 | extern int64_t* (*kvstore_lookup)(char *key); 10 | extern void (*kvstore_update)(char *key, int64_t id); 11 | extern void (*kvstore_delete)(char* key, int64_t id); 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /src/index/fingerprint_cache.h: -------------------------------------------------------------------------------- 1 | /* 2 | * fingerprint_cache.h 3 | * 4 | * Created on: Mar 24, 2014 5 | * Author: fumin 6 | */ 7 | 8 | #ifndef FINGERPRINT_CACHE_H_ 9 | #define FINGERPRINT_CACHE_H_ 10 | 11 | void init_fingerprint_cache(); 12 | int64_t fingerprint_cache_lookup(fingerprint *fp); 13 | void fingerprint_cache_prefetch(int64_t id); 14 | 15 | #endif /* FINGERPRINT_CACHE_H_ */ 16 | -------------------------------------------------------------------------------- /src/restore.h: -------------------------------------------------------------------------------- 1 | /* 2 | * restore.h 3 | * 4 | * Created on: Nov 
/* chunking.h
 * The main function here is to chunk a file: each *_chunk_data() routine
 * scans a buffer and returns the length (in bytes) of the next chunk.
 */

#ifndef CHUNK_H_
#define CHUNK_H_

/* Rabin (rolling-hash) chunking; windows_reset() clears the rolling window
 * state and chunkAlg_init() initializes the configured algorithm. */
void windows_reset();
void chunkAlg_init();
int rabin_chunk_data(unsigned char *p, int n);
int normalized_rabin_chunk_data(unsigned char *p, int n);

/* AE (Asymmetric Extremum) chunking; see ae_chunking.c. */
void ae_init();
int ae_chunk_data(unsigned char *p, int n);

/* TTTD (Two Thresholds, Two Divisors) chunking. */
int tttd_chunk_data(unsigned char *p, int n);

#endif
atc14-paper-fu_min.pdf 6 | 7 | > This paper accepted by USENIX ATC'14 describes the rewriting algorithm and garbage collection scheme in Destor. 8 | 9 | 2. fast15_destor.pdf 10 | 11 | > This paper accepted by USENIX FAST'15 describes the architecture of Destor and gives extensive experimental results. 12 | > NOTE that this is the submitted manuscript, not the camera-ready version. 13 | -------------------------------------------------------------------------------- /src/utils/sync_queue.h: -------------------------------------------------------------------------------- 1 | #ifndef SYNC_QUEUE_H_ 2 | #define SYNC_QUEUE_H_ 3 | 4 | #include 5 | #include 6 | #include "queue.h" 7 | 8 | typedef struct { 9 | int term; // terminated 10 | int max_size;/* the max size of queue */ 11 | Queue *queue; 12 | pthread_mutex_t mutex; 13 | pthread_cond_t max_work; 14 | pthread_cond_t min_work; 15 | } SyncQueue; 16 | 17 | SyncQueue* sync_queue_new(int); 18 | void sync_queue_free(SyncQueue*, void (*)(void*)); 19 | void sync_queue_push(SyncQueue*, void*); 20 | void* sync_queue_pop(SyncQueue*); 21 | void sync_queue_term(SyncQueue*); 22 | int sync_queue_size(SyncQueue* s_queue); 23 | void* sync_queue_find(SyncQueue* s_queue, int (*hit)(void*, void*), void* data, 24 | void* (*dup)(void*)); 25 | void* sync_queue_get_top(SyncQueue* s_queue); 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /src/index/index_buffer.h: -------------------------------------------------------------------------------- 1 | #ifndef INDEX_BUFFER_H_ 2 | #define INDEX_BUFFER_H_ 3 | 4 | #include "../destor.h" 5 | /* 6 | * The basic unit in index buffer. 7 | */ 8 | struct indexElem { 9 | containerid id; 10 | fingerprint fp; 11 | }; 12 | 13 | /* The buffer size > 2 * destor.rewrite_buffer_size */ 14 | /* All fingerprints that have been looked up in the index 15 | * but not been updated. 
/*
 * queue.h
 *
 * Created on: May 21, 2012
 *      Author: fumin
 */

#ifndef QUEUE_H_
#define QUEUE_H_

/* Singly-linked list node holding one user data pointer. */
typedef struct queue_ele_tag {
	struct queue_ele_tag *next;
	void *data;
} queue_ele_t;

/*
 * Structure describing a queue.
 * Unsynchronized FIFO; see sync_queue.h for the thread-safe wrapper.
 */
typedef struct queue_tag {
	queue_ele_t *first, *last; /* work queue */
	int elem_num;
	//int max_elem_num; //-1 means infi.
} Queue;

Queue* queue_new();
/* Destroy the queue; the callback is applied to each remaining element.
 * NOTE(review): whether a NULL callback is tolerated is defined in queue.c. */
void queue_free(Queue *queue, void (*)(void*));
void queue_push(Queue *queue, void *element);
/* Pop from the head (behavior on an empty queue is defined in queue.c). */
void* queue_pop(Queue *queue);
int queue_size(Queue *queue);
/* Apply func(data, user_data) to every element in order. */
void queue_foreach(Queue *queue, void (*func)(void *data, void *user_data),
		void *user_data);
/* Return the n-th element without removing it. */
void* queue_get_n(Queue *queue, int n);
void * queue_top(Queue *queue);
/* Return the first element for which hit(elem, data) is non-zero. */
void* queue_find(Queue* queue, int (*hit)(void*, void*), void* data);

#endif /* QUEUE_H_ */
#include "destor.h"
#include "jcr.h"
#include "backup.h"

/* Worker thread handle for the hash phase. */
static pthread_t hash_t;
/* Running count of hashed chunks, used only for VERBOSE logging. */
static int64_t chunk_num;

/*
 * Hash phase: pop chunks from chunk_queue, compute each data chunk's
 * SHA-1 fingerprint into c->fp, and push the chunk on to hash_queue.
 * Terminates (and propagates termination downstream) on a NULL chunk.
 */
static void* sha1_thread(void* arg) {
	char code[41]; /* 40 hex digits + NUL, for logging only */
	while (1) {
		struct chunk* c = sync_queue_pop(chunk_queue);

		/* NULL marks end of stream: terminate the output queue too. */
		if (c == NULL) {
			sync_queue_term(hash_queue);
			break;
		}

		/* File-boundary marker chunks carry no data; pass them through. */
		if (CHECK_CHUNK(c, CHUNK_FILE_START) || CHECK_CHUNK(c, CHUNK_FILE_END)) {
			sync_queue_push(hash_queue, c);
			continue;
		}

		TIMER_DECLARE(1);
		TIMER_BEGIN(1);
		SHA_CTX ctx;
		SHA_Init(&ctx);
		SHA_Update(&ctx, c->data, c->size);
		SHA_Final(c->fp, &ctx);
		TIMER_END(1, jcr.hash_time);

		hash2code(c->fp, code);
		code[40] = 0;
		VERBOSE("Hash phase: %ldth chunk identified by %s", chunk_num++, code);

		sync_queue_push(hash_queue, c);
	}
	return NULL;
}

/* Create the output queue (bounded to 100 chunks) and spawn the
 * hashing thread. */
void start_hash_phase() {
	hash_queue = sync_queue_new(100);
	pthread_create(&hash_t, NULL, sha1_thread, NULL);
}

/* Wait for the hashing thread to finish. */
void stop_hash_phase() {
	pthread_join(hash_t, NULL);
	NOTICE("hash phase stops successfully!");
}
-------------------------------------------------------------------------------- /src/utils/lru_cache.h: -------------------------------------------------------------------------------- 1 | /* 2 | * lru_cache.h 3 | * GList-based lru cache 4 | * Created on: May 23, 2012 5 | * Author: fumin 6 | */ 7 | 8 | #ifndef Cache_H_ 9 | #define Cache_H_ 10 | #define INFI_CACHE -1 11 | 12 | #include 13 | 14 | struct lruCache { 15 | GList *elem_queue; 16 | 17 | int max_size; // less then zero means infinite cache 18 | int size; 19 | 20 | double hit_count; 21 | double miss_count; 22 | 23 | void (*free_elem)(void *); 24 | int (*hit_elem)(void* elem, void* user_data); 25 | }; 26 | 27 | struct lruCache* new_lru_cache(int size, void (*free_elem)(void *), 28 | int (*hit_elem)(void* elem, void* user_data)); 29 | void free_lru_cache(struct lruCache*); 30 | void* lru_cache_lookup(struct lruCache*, void* user_data); 31 | void* lru_cache_lookup_without_update(struct lruCache* c, void* user_data); 32 | void* lru_cache_hits(struct lruCache* c, void* user_data, 33 | int (*hit)(void* elem, void* user_data)); 34 | /* Kick the elem that makes func returning 1. */ 35 | void lru_cache_kicks(struct lruCache* c, void* user_data, 36 | int (*func)(void* elem, void* user_data)); 37 | void lru_cache_insert(struct lruCache *c, void* data, 38 | void (*victim)(void*, void*), void* user_data); 39 | int lru_cache_is_full(struct lruCache*); 40 | 41 | #endif /* Cache_H_ */ 42 | -------------------------------------------------------------------------------- /src/chunking/ae_chunking.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Author: Yucheng Zhang 3 | * See his INFOCOM paper for more details. 
4 | */ 5 | 6 | #include "../destor.h" 7 | 8 | #define my_memcmp(x, y) \ 9 | ({ \ 10 | int __ret; \ 11 | uint64_t __a = __builtin_bswap64(*((uint64_t *) x)); \ 12 | uint64_t __b = __builtin_bswap64(*((uint64_t *) y)); \ 13 | if (__a > __b) \ 14 | __ret = 1; \ 15 | else \ 16 | __ret = -1; \ 17 | __ret;\ 18 | }) 19 | 20 | static int window_size = 0; 21 | 22 | /* 23 | * Calculating the window size 24 | */ 25 | void ae_init(){ 26 | double e = 2.718281828; 27 | window_size = destor.chunk_avg_size/(e-1); 28 | } 29 | 30 | /* 31 | * n is the size of string p. 32 | */ 33 | int ae_chunk_data(unsigned char *p, int n) { 34 | /* 35 | * curr points to the current position; 36 | * max points to the position of max value; 37 | * end points to the end of buffer. 38 | */ 39 | unsigned char *curr = p+1, *max = p, *end = p+n-8; 40 | 41 | if (n <= window_size + 8) 42 | return n; 43 | 44 | for (; curr <= end; curr++) { 45 | int comp_res = my_memcmp(curr, max); 46 | if (comp_res < 0) { 47 | max = curr; 48 | continue; 49 | } 50 | if (curr == max + window_size || curr == p + destor.chunk_max_size) 51 | return curr - p; 52 | } 53 | return n; 54 | } 55 | -------------------------------------------------------------------------------- /scripts/ndpl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -gt 1 ];then 4 | echo "dataset <- $1" 5 | dataset=$1 6 | echo "sampling <- $2" 7 | sampling=$2 8 | else 9 | echo "2 parameters are required" 10 | exit 1 11 | fi 12 | 13 | kernel_path="/home/dataset/kernel_8k/" 14 | vmdk_path="/home/dataset/vmdk_4k/" 15 | rdb_path="/home/dataset/rdb_4k/" 16 | synthetic_path="/home/dataset/synthetic_8k/" 17 | 18 | # path: where trace files locate 19 | case $dataset in 20 | "kernel") 21 | path=$kernel_path 22 | ;; 23 | "vmdk") 24 | path=$vmdk_path 25 | ;; 26 | "rdb") 27 | path=$rdb_path 28 | ;; 29 | "synthetic") 30 | path=$synthetic_path 31 | ;; 32 | *) 33 | echo "Wrong dataset!" 
/*
 * rewrite_phase.h
 *
 * Created on: Nov 27, 2013
 *      Author: fumin
 */

#ifndef REWRITE_PHASE_H_
#define REWRITE_PHASE_H_

#include "destor.h"

/* Per-container bookkeeping used by the rewriting algorithms. */
struct containerRecord {
	containerid cid;
	int32_t size;         /* presumably bytes referenced in this container — confirm in *_rewrite.c */
	int32_t out_of_order; /* non-zero if the container is considered out-of-order */
};

/* Buffer of chunks awaiting the rewrite decision.
 * NOTE(review): this is a tentative definition in a header, so every .c
 * including it defines `rewrite_buffer`; that links only via common
 * symbols and fails with -fno-common (the default since GCC 10). It
 * should be declared `extern` here and defined once in rewrite_phase.c —
 * confirm before changing. */
struct {
	GQueue *chunk_queue;
	GSequence *container_record_seq; /* records of containers referenced by buffered chunks */
	int num;  /* number of buffered chunks */
	int size; /* total size of buffered chunks */
} rewrite_buffer;

/* Rewrite algorithm thread entry points (one is selected at runtime). */
void* cfl_rewrite(void* arg);
void* cbr_rewrite(void* arg);
void* cap_rewrite(void* arg);

/* har_rewrite.c */
void init_har();
void close_har();
void har_monitor_update(containerid id, int32_t size);
void har_check(struct chunk* c);

/* restore_aware.c */
void init_restore_aware();
void restore_aware_update(containerid id, int32_t chunklen);
int restore_aware_contains(containerid id);
double restore_aware_get_cfl();

/* For sorting container records. */
gint g_record_descmp_by_length(struct containerRecord* a,
		struct containerRecord* b, gpointer user_data);
gint g_record_cmp_by_id(struct containerRecord* a, struct containerRecord* b,
		gpointer user_data);

int rewrite_buffer_push(struct chunk* c);
struct chunk* rewrite_buffer_pop();
struct chunk* rewrite_buffer_top();

#endif /* REWRITE_PHASE_H_ */
28 | */ 29 | void index_update(GHashTable *features, int64_t id); 30 | 31 | void index_delete(fingerprint *fp, int64_t id); 32 | 33 | void index_check_buffer(struct segment *s); 34 | int index_update_buffer(struct segment *s); 35 | 36 | //void index_delete(fingerprint *); 37 | 38 | extern GHashTable* (*sampling)(GSequence *chunks, int32_t chunk_num); 39 | extern struct segment* (*segmenting)(struct chunk *c); 40 | 41 | gboolean g_feature_equal(char* a, char* b); 42 | guint g_feature_hash(char *feature); 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /src/jcr.h: -------------------------------------------------------------------------------- 1 | /* 2 | * jcr.h 3 | * 4 | * Created on: Feb 15, 2012 5 | * Author: fumin 6 | */ 7 | 8 | #ifndef Jcr_H_ 9 | #define Jcr_H_ 10 | 11 | #include "destor.h" 12 | #include "recipe/recipestore.h" 13 | 14 | #define JCR_STATUS_INIT 1 15 | #define JCR_STATUS_RUNNING 2 16 | #define JCR_STATUS_DONE 3 17 | 18 | /* job control record */ 19 | struct jcr{ 20 | int32_t id; 21 | /* 22 | * The path of backup or restore. 
23 | */ 24 | sds path; 25 | 26 | int status; 27 | 28 | int32_t file_num; 29 | int64_t data_size; 30 | int64_t unique_data_size; 31 | int32_t chunk_num; 32 | int32_t unique_chunk_num; 33 | int32_t zero_chunk_num; 34 | int64_t zero_chunk_size; 35 | int32_t rewritten_chunk_num; 36 | int64_t rewritten_chunk_size; 37 | 38 | int32_t sparse_container_num; 39 | int32_t inherited_sparse_num; 40 | int32_t total_container_num; 41 | 42 | struct backupVersion* bv; 43 | 44 | double total_time; 45 | /* 46 | * the time consuming of six dedup phase 47 | */ 48 | double read_time; 49 | double chunk_time; 50 | double hash_time; 51 | double dedup_time; 52 | double rewrite_time; 53 | double filter_time; 54 | double write_time; 55 | 56 | double read_recipe_time; 57 | double read_chunk_time; 58 | double write_chunk_time; 59 | 60 | int32_t read_container_num; 61 | }; 62 | 63 | extern struct jcr jcr; 64 | 65 | void init_jcr(char *path); 66 | void init_backup_jcr(char *path); 67 | void init_restore_jcr(int revision, char *path); 68 | 69 | #endif /* Jcr_H_ */ 70 | -------------------------------------------------------------------------------- /scripts/edpl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dataset="kernel" 4 | 5 | if [ $# -gt 0 ];then 6 | echo "dataset <- $1" 7 | dataset=$1 8 | else 9 | echo "default dataset <- $dataset" 10 | fi 11 | 12 | kernel_path="/home/dataset/kernel_8k/" 13 | vmdk_path="/home/dataset/vmdk_4k/" 14 | rdb_path="/home/dataset/rdb_4k/" 15 | synthetic_path="/home/dataset/synthetic_8k/" 16 | 17 | kernel_fcs=(256 512 1024 2048 4096) 18 | vmdk_fcs=(256 512 1024 2048 4096) 19 | rdb_fcs=(256 512 1024 2048 4096) 20 | synthetic_fcs=(256 512 1024 2048 4096) 21 | 22 | # path: where trace files locate 23 | # fcs: the restore cache size 24 | case $dataset in 25 | "kernel") 26 | path=$kernel_path 27 | fcs=(${kernel_fcs[@]}) 28 | ;; 29 | "vmdk") 30 | path=$vmdk_path 31 | fcs=(${vmdk_fcs[@]}) 32 | ;; 33 | "rdb") 34 
/*
 * backup.h
 *
 * Created on: Dec 4, 2013
 *      Author: fumin
 */

#ifndef BACKUP_H_
#define BACKUP_H_

#include "destor.h"
#include "utils/sync_queue.h"

/*
 * The backup pipeline phases, wired together by the SyncQueues declared
 * at the bottom of this header. The chunk stream carries file-boundary
 * markers:
 * CHUNK_FILE_START NORMAL_CHUNK... CHUNK_FILE_END
 */
void start_read_phase();
void stop_read_phase();

/*
 * Input: Raw data blocks
 * Output: Chunks
 */
void start_chunk_phase();
void stop_chunk_phase();
/* Input: Chunks
 * Output: Hashed Chunks.
 */
void start_hash_phase();
void stop_hash_phase();

void start_read_trace_phase();
void stop_read_trace_phase();

/*
 * Duplicate chunks are marked CHUNK_DUPLICATE
 */
void start_dedup_phase();
void stop_dedup_phase();
/*
 * Fragmented chunks are marked CHUNK_SPARSE, CHUNK_OUT_OF_ORDER or CHUNK_NOT_IN_CACHE.
 */
void start_rewrite_phase();
void stop_rewrite_phase();
/*
 * Determine which chunks are required to be written according to their flags.
 * All unique/rewritten chunks aggregate into containers.
 *
 * output: containers
 */
void start_filter_phase();
void stop_filter_phase();
/*
 * Write containers.
 */
void start_append_phase();
void stop_append_phase();

/* NOTE(review): the SyncQueue* globals below are tentative definitions in
 * a header included by many .c files; they link only via common symbols
 * and fail with -fno-common (the default since GCC 10). They should be
 * `extern` here with one definition in a .c file — confirm before changing. */
/* Output of read phase. */
SyncQueue* read_queue;
/* Output of chunk phase. */
SyncQueue* chunk_queue;
/* Output of hash phase. */
SyncQueue* hash_queue;
/* Output of trace phase. */
SyncQueue* trace_queue;
/* Output of dedup phase */
SyncQueue* dedup_queue;
/* Output of rewrite phase. */
SyncQueue* rewrite_queue;

#endif /* BACKUP_H_ */
49 | exit 1 50 | ;; 51 | esac 52 | 53 | # ./rebuild would clear data of previous experiments 54 | # ./destor executes a backup job 55 | # (results are written to backup.log) 56 | # ./destor -rN executes a restore job under various restore cache size 57 | # (results are written to restore.log) 58 | 59 | # r is the sampling Ratio 60 | for r in 1 16 32 64 128 256;do 61 | ../rebuild 62 | for file in $(ls $path);do 63 | ../destor $path/$file -p"fingerprint-index exact logical" -p"fingerprint-index-segment-algorithm $segmenting 1024" -p"fingerprint-index-sampling-method $sampling $r" >> log 64 | done 65 | ../destor -s >> backup.log 66 | done 67 | -------------------------------------------------------------------------------- /src/jcr.c: -------------------------------------------------------------------------------- 1 | /* 2 | * jcr.cpp 3 | * 4 | * Created on: Feb 15, 2012 5 | * Author: fumin 6 | */ 7 | 8 | #include "jcr.h" 9 | 10 | struct jcr jcr; 11 | 12 | void init_jcr(char *path) { 13 | jcr.path = sdsnew(path); 14 | 15 | struct stat s; 16 | if (stat(path, &s) != 0) { 17 | fprintf(stderr, "backup path does not exist!"); 18 | exit(1); 19 | } 20 | if (S_ISDIR(s.st_mode) && jcr.path[sdslen(jcr.path) - 1] != '/') 21 | jcr.path = sdscat(jcr.path, "/"); 22 | 23 | jcr.bv = NULL; 24 | 25 | jcr.id = TEMPORARY_ID; 26 | 27 | jcr.status = JCR_STATUS_INIT; 28 | 29 | jcr.file_num = 0; 30 | jcr.data_size = 0; 31 | jcr.unique_data_size = 0; 32 | jcr.chunk_num = 0; 33 | jcr.unique_chunk_num = 0; 34 | jcr.zero_chunk_num = 0; 35 | jcr.zero_chunk_size = 0; 36 | jcr.rewritten_chunk_num = 0; 37 | jcr.rewritten_chunk_size = 0; 38 | 39 | jcr.sparse_container_num = 0; 40 | jcr.inherited_sparse_num = 0; 41 | jcr.total_container_num = 0; 42 | 43 | jcr.total_time = 0; 44 | /* 45 | * the time consuming of seven backup phase 46 | */ 47 | jcr.read_time = 0; 48 | jcr.chunk_time = 0; 49 | jcr.hash_time = 0; 50 | jcr.dedup_time = 0; 51 | jcr.rewrite_time = 0; 52 | jcr.filter_time = 0; 53 | 
jcr.write_time = 0; 54 | 55 | /* 56 | * the time consuming of three restore phase 57 | */ 58 | jcr.read_recipe_time = 0; 59 | jcr.read_chunk_time = 0; 60 | jcr.write_chunk_time = 0; 61 | 62 | jcr.read_container_num = 0; 63 | } 64 | 65 | void init_backup_jcr(char *path) { 66 | 67 | init_jcr(path); 68 | 69 | jcr.bv = create_backup_version(jcr.path); 70 | 71 | jcr.id = jcr.bv->bv_num; 72 | } 73 | 74 | void init_restore_jcr(int revision, char *path) { 75 | 76 | init_jcr(path); 77 | 78 | jcr.bv = open_backup_version(revision); 79 | 80 | if(jcr.bv->deleted == 1){ 81 | WARNING("The backup has been deleted!"); 82 | exit(1); 83 | } 84 | 85 | jcr.id = revision; 86 | } 87 | -------------------------------------------------------------------------------- /scripts/cost.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -gt 0 ];then 4 | echo "dataset <- $1" 5 | dataset=$1 6 | else 7 | echo "1 parameters are required" 8 | exit 1 9 | fi 10 | 11 | kernel_path="/home/dataset/kernel_8k/" 12 | vmdk_path="/home/dataset/vmdk_4k/" 13 | rdb_path="/home/dataset/rdb_4k/" 14 | synthetic_path="/home/dataset/synthetic_8k/" 15 | 16 | # path: where trace files locate 17 | case $dataset in 18 | "kernel") 19 | path=$kernel_path 20 | ;; 21 | "vmdk") 22 | path=$vmdk_path 23 | ;; 24 | "rdb") 25 | path=$rdb_path 26 | ;; 27 | "synthetic") 28 | path=$synthetic_path 29 | ;; 30 | *) 31 | echo "Wrong dataset!" 
32 | exit 1 33 | ;; 34 | esac 35 | 36 | # ./rebuild would clear data of previous experiments 37 | # ./destor executes a backup job 38 | # (results are written to backup.log) 39 | # ./destor -rN executes a restore job under various restore cache size 40 | # (results are written to restore.log) 41 | 42 | # r is the sampling Ratio 43 | for r in 64 128 256;do 44 | ../rebuild 45 | for file in $(ls $path);do 46 | ../destor $path/$file -p"fingerprint-index near-exact logical" -p"fingerprint-index-segment-algorithm content-defined 1024" -p"fingerprint-index-sampling-method random $r" -p"fingerprint-index-segment-selection top 4" -p"fingerprint-index-segment-prefetching 4" -p"fingerprint-index-value-length 1" >> log 47 | done 48 | ../destor -s >> backup.log 49 | done 50 | 51 | for r in 64 128 256;do 52 | ../rebuild 53 | for file in $(ls $path);do 54 | ../destor $path/$file -p"fingerprint-index near-exact logical" -p"fingerprint-index-segment-algorithm content-defined 1024" -p"fingerprint-index-sampling-method min $r" -p"fingerprint-index-segment-selection top 4" -p"fingerprint-index-segment-prefetching 4" -p"fingerprint-index-value-length 1" >> log 55 | done 56 | ../destor -s >> backup.log 57 | done 58 | -------------------------------------------------------------------------------- /src/restore_aware.c: -------------------------------------------------------------------------------- 1 | /** 2 | * @file cfl_monitor.c 3 | * @Synopsis only used in cfl_monitor 4 | * @author fumin, fumin@hust.edu.cn 5 | * @version 1 6 | * @date 2012-12-12 7 | */ 8 | #include "rewrite_phase.h" 9 | #include "storage/containerstore.h" 10 | #include "utils/lru_cache.h" 11 | 12 | struct { 13 | int64_t total_size; 14 | 15 | int ocf; //data amount/CONTAINER_SIZE 16 | int ccf; 17 | double cfl; //ocf/ccf 18 | 19 | struct lruCache *cache; 20 | } monitor; 21 | 22 | static int container_record_check_id(struct containerRecord* a, 23 | containerid *id){ 24 | return a->cid == *id ? 
1 : 0; 25 | } 26 | 27 | /*static int container_record_equal(struct containerRecord* a, 28 | struct containerRecord* b) { 29 | return a->cid == b->cid ? 1 : 0; 30 | }*/ 31 | 32 | void init_restore_aware() { 33 | monitor.total_size = 0; 34 | 35 | monitor.ccf = 0; 36 | monitor.ocf = 0; 37 | monitor.cfl = 0; 38 | monitor.cache = new_lru_cache(destor.restore_cache[1], free, 39 | container_record_check_id); 40 | } 41 | 42 | /* 43 | * Maintain a LRU cache internally to simulate recovery process when backing-up. 44 | */ 45 | void restore_aware_update(containerid id, int32_t chunklen) { 46 | monitor.total_size += chunklen + CONTAINER_META_ENTRY; 47 | 48 | struct containerRecord* record = lru_cache_lookup(monitor.cache, &id); 49 | if (!record) { 50 | record = (struct containerRecord*) malloc( 51 | sizeof(struct containerRecord)); 52 | record->cid = id; 53 | lru_cache_insert(monitor.cache, record, NULL, NULL); 54 | 55 | monitor.ccf++; 56 | } 57 | 58 | monitor.ocf = (monitor.total_size + CONTAINER_SIZE - 1) / CONTAINER_SIZE; 59 | monitor.cfl = monitor.ocf / (double) monitor.ccf; 60 | } 61 | 62 | int restore_aware_contains(containerid id) { 63 | return lru_cache_lookup_without_update(monitor.cache, &id) ? 1 : 0; 64 | } 65 | 66 | double restore_aware_get_cfl() { 67 | return monitor.cfl > 1 ? 
1 : monitor.cfl; 68 | } 69 | -------------------------------------------------------------------------------- /scripts/ndll_spv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -gt 0 ];then 4 | echo "dataset <- $1" 5 | dataset=$1 6 | else 7 | echo "1 parameters are required" 8 | exit 1 9 | fi 10 | 11 | kernel_path="/home/dataset/kernel_8k/" 12 | vmdk_path="/home/dataset/vmdk_4k/" 13 | rdb_path="/home/dataset/rdb_4k/" 14 | synthetic_path="/home/dataset/synthetic_8k/" 15 | 16 | # path: where trace files locate 17 | case $dataset in 18 | "kernel") 19 | path=$kernel_path 20 | ;; 21 | "vmdk") 22 | path=$vmdk_path 23 | ;; 24 | "rdb") 25 | path=$rdb_path 26 | ;; 27 | "synthetic") 28 | path=$synthetic_path 29 | ;; 30 | *) 31 | echo "Wrong dataset!" 32 | exit 1 33 | ;; 34 | esac 35 | 36 | # ./rebuild would clear data of previous experiments 37 | # ./destor executes a backup job 38 | # (results are written to backup.log) 39 | # ./destor -rN executes a restore job under various restore cache size 40 | # (results are written to restore.log) 41 | 42 | # r is the sampling Ratio 43 | for p in 1 2 4;do 44 | ../rebuild 45 | for file in $(ls $path);do 46 | ../destor $path/$file -p"fingerprint-index near-exact logical" -p"fingerprint-index-segment-algorithm content-defined 1024" -p"fingerprint-index-sampling-method random 128" -p"fingerprint-index-segment-selection base" -p"fingerprint-index-segment-prefetching $p" >> log 47 | done 48 | ../destor -s >> backup.log 49 | done 50 | 51 | for s in 1 2 4;do 52 | for p in 1 2 4;do 53 | for v in 1 2 4;do 54 | ../rebuild 55 | for file in $(ls $path);do 56 | ../destor $path/$file -p"fingerprint-index near-exact logical" -p"fingerprint-index-segment-algorithm content-defined 1024" -p"fingerprint-index-sampling-method random 128" -p"fingerprint-index-segment-selection top $s" -p"fingerprint-index-segment-prefetching $p" -p"fingerprint-index-value-length $v" >> log 57 | done 58 | 
../destor -s >> backup.log 59 | done 60 | done 61 | done 62 | -------------------------------------------------------------------------------- /src/storage/containerstore.h: -------------------------------------------------------------------------------- 1 | /* 2 | * containerstore.h 3 | * 4 | * Created on: Nov 11, 2013 5 | * Author: fumin 6 | */ 7 | 8 | #ifndef CONTAINERSTORE_H_ 9 | #define CONTAINERSTORE_H_ 10 | 11 | #include "../destor.h" 12 | 13 | #define CONTAINER_SIZE (4194304ll) //4MB 14 | #define CONTAINER_META_SIZE (32768ll) //32KB 15 | #define CONTAINER_HEAD 16 16 | #define CONTAINER_META_ENTRY 28 17 | 18 | struct containerMeta { 19 | containerid id; 20 | int32_t data_size; 21 | int32_t chunk_num; 22 | 23 | /* Map fingerprints to chunk offsets. */ 24 | GHashTable *map; 25 | }; 26 | 27 | struct container { 28 | struct containerMeta meta; 29 | unsigned char *data; 30 | }; 31 | 32 | void init_container_store(); 33 | void close_container_store(); 34 | 35 | struct container* create_container(); 36 | 37 | void write_container(struct container*); 38 | void write_container_async(struct container*); 39 | struct container* retrieve_container_by_id(containerid); 40 | struct containerMeta* retrieve_container_meta_by_id(containerid); 41 | struct containerMeta* retrieve_container_meta_by_id_async(containerid); 42 | 43 | struct chunk* get_chunk_in_container(struct container*, fingerprint*); 44 | int add_chunk_to_container(struct container*, struct chunk*); 45 | int container_overflow(struct container*, int32_t size); 46 | void free_container(struct container*); 47 | void free_container_meta(struct containerMeta*); 48 | containerid get_container_id(struct container* c); 49 | int container_empty(struct container* c); 50 | 51 | gint g_container_cmp_desc(struct container*, struct container*, gpointer); 52 | gint g_container_meta_cmp_desc(struct containerMeta*, struct containerMeta*, 53 | gpointer); 54 | 55 | int lookup_fingerprint_in_container(struct container*, fingerprint 
*); 56 | int lookup_fingerprint_in_container_meta(struct containerMeta*, fingerprint *); 57 | int container_check_id(struct container*, containerid*); 58 | int container_meta_check_id(struct containerMeta*, containerid*); 59 | 60 | void container_meta_foreach(struct containerMeta* cm, void (*func)(fingerprint*, void*), void* data); 61 | 62 | #endif /* CONTAINERSTORE_H_ */ 63 | -------------------------------------------------------------------------------- /scripts/restore.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dataset="kernel" 4 | 5 | if [ $# -gt 0 ];then 6 | echo "dataset <- $1" 7 | dataset=$1 8 | else 9 | echo "default dataset <- $dataset" 10 | fi 11 | 12 | kernel_path="/home/dataset/kernel_8k/" 13 | vmdk_path="/home/dataset/vmdk_4k/" 14 | rdb_path="/home/dataset/rdb_4k/" 15 | synthetic_path="/home/dataset/synthetic_8k/" 16 | 17 | # path: where trace files locate 18 | # fcs: the restore cache size 19 | case $dataset in 20 | "kernel") 21 | path=$kernel_path 22 | rcs=128 23 | ;; 24 | "vmdk") 25 | path=$vmdk_path 26 | rcs=1024 27 | ;; 28 | "rdb") 29 | path=$rdb_path 30 | rcs=1024 31 | ;; 32 | "synthetic") 33 | path=$synthetic_path 34 | rcs=1024 35 | ;; 36 | *) 37 | echo "Wrong dataset!" 
38 | exit 1 39 | ;; 40 | esac 41 | 42 | # ./rebuild would clear data of previous experiments 43 | # ./destor executes a backup job 44 | # (results are written to backup.log) 45 | # ./destor -rN executes a restore job under various restore cache size 46 | # (results are written to restore.log) 47 | i=0 48 | ../rebuild 49 | for file in $(ls $path);do 50 | ../destor $path/$file -p"fingerprint-index exact physical" >> log 51 | ../destor -r$i /home/fumin/restore -p"restore-cache opt $rcs" >> log 52 | i=$(($i+1)) 53 | done 54 | ../destor -s >> backup.log 55 | 56 | i=0 57 | ../rebuild 58 | for file in $(ls $path);do 59 | ../destor $path/$file -p"fingerprint-index exact physical" -p"rewrite-enable-har yes" -p"rewrite-har-utilization-threshold 0.4" -p"rewrite-har-rewrite-limit 1" >> log 60 | ../destor -r$i /home/fumin/restore -p"restore-cache opt $rcs" >> log 61 | i=$(($i+1)) 62 | done 63 | ../destor -s >> backup.log 64 | 65 | for r in 128 256 512;do 66 | i=0 67 | ../rebuild 68 | for file in $(ls $path);do 69 | ../destor $path/$file -p"fingerprint-index near-exact physical" -p"fingerprint-index-sampling-method uniform $r " >> log 70 | ../destor -r$i /home/fumin/restore -p"restore-cache opt $rcs" >> log 71 | i=$(($i+1)) 72 | done 73 | ../destor -s >> backup.log 74 | done 75 | -------------------------------------------------------------------------------- /scripts/hybrid.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dataset="kernel" 4 | 5 | if [ $# -gt 0 ];then 6 | echo "dataset <- $1" 7 | dataset=$1 8 | else 9 | echo "default dataset <- $dataset" 10 | fi 11 | 12 | kernel_path="/home/dataset/kernel_8k/" 13 | vmdk_path="/home/dataset/vmdk_4k/" 14 | rdb_path="/home/dataset/rdb_4k/" 15 | synthetic_path="/home/dataset/synthetic_8k/" 16 | 17 | kernel_rcs=(4 8 16 32 64 128 256 ) 18 | vmdk_rcs=(64 128 256 512 1024 2048 4096) 19 | rdb_rcs=(8 16 32 64 128 256 512) 20 | synthetic_rcs=(8 16 32 64 128 256 512) 21 | 22 | # 
path: where trace files locate 23 | # rcs: the restore cache size 24 | case $dataset in 25 | "kernel") 26 | path=$kernel_path 27 | rcs=(${kernel_rcs[@]}) 28 | ;; 29 | "vmdk") 30 | path=$vmdk_path 31 | rcs=(${vmdk_rcs[@]}) 32 | ;; 33 | "rdb") 34 | path=$rdb_path 35 | rcs=(${rdb_rcs[@]}) 36 | ;; 37 | "synthetic") 38 | path=$synthetic_path 39 | rcs=(${synthetic_rcs[@]}) 40 | ;; 41 | *) 42 | echo "Wrong dataset!" 43 | exit 1 44 | ;; 45 | esac 46 | 47 | # ./rebuild would clear data of previous experiments 48 | # ./destor executes a backup job 49 | # (results are written to backup.log) 50 | # ./destor -rN executes a restore job under various restore cache size 51 | # (results are written to restore.log) 52 | 53 | for c in ${rcs[@]};do 54 | n=0 55 | ../rebuild 56 | for file in $(ls $path);do 57 | ../destor $path/$file -p"restore-cache lru $c" >> log 58 | ../destor -r$n /home/fumin/restore -p"restore-cache lru $c" >> log 59 | ../destor -r$n /home/fumin/restore -p"restore-cache opt $c" >> log 60 | n=$(($n+1)) 61 | done 62 | ../destor -s >> backup.log 63 | 64 | done 65 | 66 | # split the restore.log according to the restore cache size 67 | split_file(){ 68 | lines=$(cat $1) # read the file 69 | IFS=$'\n' # split 'lines' by '\n' 70 | lineno=0 71 | for line in $lines; do 72 | index=$(( ($lineno)%2 )) 73 | if [ $(($lineno%2)) -eq 0 ];then 74 | echo $line >> restore.lru.log 75 | else 76 | echo $line >> restore.opt.log 77 | fi 78 | lineno=$(($lineno+1)) 79 | done 80 | } 81 | 82 | split_file restore.log 83 | 84 | -------------------------------------------------------------------------------- /scripts/rewrite.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dataset="kernel" 4 | 5 | if [ $# -gt 0 ];then 6 | echo "dataset <- $1" 7 | dataset=$1 8 | else 9 | echo "default dataset <- $dataset" 10 | fi 11 | 12 | kernel_path="/home/dataset/kernel_8k/" 13 | vmdk_path="/home/dataset/vmdk_4k/" 14 | rdb_path="/home/dataset/rdb_4k/" 
15 | synthetic_path="/home/dataset/synthetic_8k/" 16 | 17 | kernel_rcs=(4 8 16 32 64 128 256 ) 18 | vmdk_rcs=(64 128 256 512 1024 2048 4096) 19 | rdb_rcs=(8 16 32 64 128 256 512) 20 | synthetic_rcs=(8 16 32 64 128 256 512) 21 | 22 | # path: where trace files locate 23 | # rcs: the restore cache size 24 | case $dataset in 25 | "kernel") 26 | path=$kernel_path 27 | rcs=(${kernel_rcs[@]}) 28 | ;; 29 | "vmdk") 30 | path=$vmdk_path 31 | rcs=(${vmdk_rcs[@]}) 32 | ;; 33 | "rdb") 34 | path=$rdb_path 35 | rcs=(${rdb_rcs[@]}) 36 | ;; 37 | "synthetic") 38 | path=$synthetic_path 39 | rcs=(${synthetic_rcs[@]}) 40 | ;; 41 | *) 42 | echo "Wrong dataset!" 43 | exit 1 44 | ;; 45 | esac 46 | 47 | # ./rebuild would clear data of previous experiments 48 | # ./destor executes a backup job 49 | # (results are written to backup.log) 50 | # ./destor -rN executes a restore job under various restore cache size 51 | # (results are written to restore.log) 52 | 53 | n=0 54 | ../rebuild 55 | for file in $(ls $path);do 56 | ../destor $path/$file >> log 57 | for s in ${rcs[@]};do 58 | ../destor -r$n /home/fumin/restore -p"restore-cache lru $s" >> log 59 | ../destor -r$n /home/fumin/restore -p"restore-cache opt $s" >> log 60 | done 61 | n=$(($n+1)) 62 | done 63 | ../destor -s >> backup.log 64 | 65 | # split the restore.log according to the restore cache size 66 | split_file(){ 67 | lines=$(cat $1) # read the file 68 | IFS=$'\n' # split 'lines' by '\n' 69 | lineno=0 70 | for line in $lines; do 71 | index=$(( ($lineno/2)%${#rcs[@]} )) 72 | if [ $(($lineno%2)) -eq 0 ];then 73 | echo $line >> restore.lru${rcs[$index]}.log 74 | else 75 | echo $line >> restore.opt${rcs[$index]}.log 76 | fi 77 | lineno=$(($lineno+1)) 78 | done 79 | } 80 | 81 | split_file restore.log 82 | -------------------------------------------------------------------------------- /scripts/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dataset="kernel" 4 | 5 | if 
[ $# -gt 0 ];then 6 | echo "dataset <- $1" 7 | dataset=$1 8 | else 9 | echo "default dataset <- $dataset" 10 | fi 11 | 12 | kernel_path="/home/dataset/kernel_8k/" 13 | vmdk_path="/home/dataset/vmdk_4k/" 14 | rdb_path="/home/dataset/rdb_4k/" 15 | synthetic_path="/home/dataset/synthetic_8k/" 16 | 17 | kernel_rcs=(4 8 16 32 64 128 256 ) 18 | vmdk_rcs=(64 128 256 512 1024 2048 4096) 19 | rdb_rcs=(8 16 32 64 128 256 512) 20 | synthetic_rcs=(8 16 32 64 128 256 512) 21 | 22 | # path: where trace files locate 23 | # rcs: the restore cache size 24 | case $dataset in 25 | "kernel") 26 | path=$kernel_path 27 | rcs=(${kernel_rcs[@]}) 28 | ;; 29 | "vmdk") 30 | path=$vmdk_path 31 | rcs=(${vmdk_rcs[@]}) 32 | ;; 33 | "rdb") 34 | path=$rdb_path 35 | rcs=(${rdb_rcs[@]}) 36 | ;; 37 | "synthetic") 38 | path=$synthetic_path 39 | rcs=(${synthetic_rcs[@]}) 40 | ;; 41 | *) 42 | echo "Wrong dataset!" 43 | exit 1 44 | ;; 45 | esac 46 | 47 | # ./rebuild would clear data of previous experiments 48 | # ./destor executes a backup job 49 | # (results are written to backup.log) 50 | # ./destor -rN executes a restore job under various restore cache size 51 | # (results are written to restore.log) 52 | 53 | n=0 54 | ../rebuild 55 | for file in $(ls $path);do 56 | ../destor $path/$file >> log 57 | for s in ${rcs[@]};do 58 | ../destor -r$n /home/fumin/restore -p"restore-cache lru $s" >> log 59 | ../destor -r$n /home/fumin/restore -p"restore-cache opt $s" >> log 60 | done 61 | n=$(($n+1)) 62 | done 63 | ../destor -s >> backup.log 64 | 65 | # split the restore.log according to the restore cache size 66 | split_file(){ 67 | lines=$(cat $1) # read the file 68 | IFS=$'\n' # split 'lines' by '\n' 69 | lineno=0 70 | for line in $lines; do 71 | index=$(( ($lineno/2)%${#rcs[@]} )) 72 | if [ $(($lineno%2)) -eq 0 ];then 73 | echo $line >> restore.lru${rcs[$index]}.log 74 | else 75 | echo $line >> restore.opt${rcs[$index]}.log 76 | fi 77 | lineno=$(($lineno+1)) 78 | done 79 | } 80 | 81 | split_file restore.log 
82 | -------------------------------------------------------------------------------- /scripts/caf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dataset="kernel" 4 | 5 | if [ $# -gt 0 ];then 6 | echo "dataset <- $1" 7 | dataset=$1 8 | else 9 | echo "default dataset <- $dataset" 10 | fi 11 | 12 | kernel_path="/home/dataset/kernel_8k/" 13 | vmdk_path="/home/dataset/vmdk_4k/" 14 | rdb_path="/home/dataset/rdb_4k/" 15 | synthetic_path="/home/dataset/synthetic_8k/" 16 | 17 | kernel_rcs=(4 8 16 32 64 128 256 ) 18 | vmdk_rcs=(64 128 256 512 1024 2048 4096) 19 | rdb_rcs=(8 16 32 64 128 256 512) 20 | synthetic_rcs=(8 16 32 64 128 256 512) 21 | 22 | # path: where trace files locate 23 | # rcs: the restore cache size 24 | case $dataset in 25 | "kernel") 26 | path=$kernel_path 27 | rcs=(${kernel_rcs[@]}) 28 | ;; 29 | "vmdk") 30 | path=$vmdk_path 31 | rcs=(${vmdk_rcs[@]}) 32 | ;; 33 | "rdb") 34 | path=$rdb_path 35 | rcs=(${rdb_rcs[@]}) 36 | ;; 37 | "synthetic") 38 | path=$synthetic_path 39 | rcs=(${synthetic_rcs[@]}) 40 | ;; 41 | *) 42 | echo "Wrong dataset!" 
43 | exit 1 44 | ;; 45 | esac 46 | 47 | # ./rebuild would clear data of previous experiments 48 | # ./destor executes a backup job 49 | # (results are written to backup.log) 50 | # ./destor -rN executes a restore job under various restore cache size 51 | # (results are written to restore.log) 52 | 53 | for c in ${rcs[@]};do 54 | n=0 55 | ../rebuild 56 | for file in $(ls $path);do 57 | ../destor $path/$file -p"restore-cache lru $c" >> log 58 | for s in ${rcs[@]};do 59 | ../destor -r$n /home/fumin/restore -p"restore-cache lru $s" >> log 60 | ../destor -r$n /home/fumin/restore -p"restore-cache opt $s" >> log 61 | done 62 | n=$(($n+1)) 63 | done 64 | ../destor -s >> backup.log 65 | 66 | # split the restore.log according to the restore cache size 67 | split_file(){ 68 | lines=$(cat $1) # read the file 69 | IFS=$'\n' # split 'lines' by '\n' 70 | lineno=0 71 | for line in $lines; do 72 | index=$(( ($lineno/2)%${#rcs[@]} )) 73 | if [ $(($lineno%2)) -eq 0 ];then 74 | echo $line >> ${c}restore.lru${rcs[$index]}.log 75 | else 76 | echo $line >> ${c}restore.opt${rcs[$index]}.log 77 | fi 78 | lineno=$(($lineno+1)) 79 | done 80 | } 81 | 82 | split_file restore.log 83 | 84 | rm restore.log 85 | done 86 | -------------------------------------------------------------------------------- /scripts/ndll.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -gt 0 ];then 4 | echo "dataset <- $1" 5 | dataset=$1 6 | else 7 | echo "1 parameters are required" 8 | exit 1 9 | fi 10 | 11 | kernel_path="/home/dataset/kernel_8k/" 12 | vmdk_path="/home/dataset/vmdk_4k/" 13 | rdb_path="/home/dataset/rdb_4k/" 14 | synthetic_path="/home/dataset/synthetic_8k/" 15 | 16 | # path: where trace files locate 17 | case $dataset in 18 | "kernel") 19 | path=$kernel_path 20 | ;; 21 | "vmdk") 22 | path=$vmdk_path 23 | ;; 24 | "rdb") 25 | path=$rdb_path 26 | ;; 27 | "synthetic") 28 | path=$synthetic_path 29 | ;; 30 | *) 31 | echo "Wrong dataset!" 
32 | exit 1 33 | ;; 34 | esac 35 | 36 | # ./rebuild would clear data of previous experiments 37 | # ./destor executes a backup job 38 | # (results are written to backup.log) 39 | # ./destor -rN executes a restore job under various restore cache size 40 | # (results are written to restore.log) 41 | 42 | # r is the sampling Ratio 43 | for r in 16 32 64 128 256 512;do 44 | ../rebuild 45 | for file in $(ls $path);do 46 | ../destor $path/$file -p"fingerprint-index near-exact logical" -p"fingerprint-index-segment-algorithm content-defined 1024" -p"fingerprint-index-sampling-method uniform $r" -p"fingerprint-index-segment-selection base" >> log 47 | done 48 | ../destor -s >> backup.log 49 | done 50 | 51 | for r in 16 32 64 128 256 512;do 52 | ../rebuild 53 | for file in $(ls $path);do 54 | ../destor $path/$file -p"fingerprint-index near-exact logical" -p"fingerprint-index-segment-algorithm content-defined 1024" -p"fingerprint-index-sampling-method random $r" -p"fingerprint-index-segment-selection base" >> log 55 | done 56 | ../destor -s >> backup.log 57 | done 58 | 59 | for r in 16 32 64 128 256 512;do 60 | ../rebuild 61 | for file in $(ls $path);do 62 | ../destor $path/$file -p"fingerprint-index near-exact logical" -p"fingerprint-index-segment-algorithm content-defined 1024" -p"fingerprint-index-sampling-method min $r" -p"fingerprint-index-segment-selection top 1" >> log 63 | done 64 | ../destor -s >> backup.log 65 | done 66 | 67 | for r in 16 32 64 128 256 512;do 68 | ../rebuild 69 | for file in $(ls $path);do 70 | ../destor $path/$file -p"fingerprint-index near-exact logical" -p"fingerprint-index-segment-algorithm content-defined 1024" -p"fingerprint-index-sampling-method random $r" -p"fingerprint-index-segment-selection top 1" >> log 71 | done 72 | ../destor -s >> backup.log 73 | done 74 | -------------------------------------------------------------------------------- /configure.ac: -------------------------------------------------------------------------------- 1 | 
# -*- Autoconf -*- 2 | # Process this file with autoconf to produce a configure script. 3 | 4 | AC_PREREQ([2.67]) 5 | AC_INIT([destor], [2.0], [fumin@hust.edu.cn]) 6 | AM_INIT_AUTOMAKE([destor], [2.0]) 7 | 8 | # Checks for programs. 9 | AC_PROG_CC 10 | CFLAGS="-g -O2 -Wall" 11 | 12 | # Checks for libraries. 13 | AC_CHECK_LIB([pthread], [pthread_create]) 14 | AC_CHECK_LIB([glib], [g_hash_table_new],[found_glib=yes],[found_glib=no]) 15 | if test "$found_glib" = "no"; then 16 | AC_MSG_ERROR([*** Working glib library not found ***]) 17 | fi 18 | AC_CHECK_LIB([glib], [g_thread_init],[thread_safe=no],[thread_safe=yes]) 19 | if test "$thread_safe" = "no"; then 20 | AC_MSG_ERROR([*** The version of glib is too low ***]) 21 | fi 22 | AC_CHECK_LIB([crypto],[SHA_Update],[found_crypto=yes],[found_crypto=no]) 23 | if test "$found_crypto" = "no"; then 24 | AC_MSG_ERROR([*** Working crypto library not found ***]) 25 | fi 26 | #AC_CHECK_LIB([mysqlclient],[mysql_init],[found_mysql=yes],[found_mysql=no]) 27 | #if test "$found_mysql" = "no"; then 28 | #AC_MSG_ERROR([*** Working mysql library not found ***]) 29 | #fi 30 | AC_PROG_RANLIB 31 | 32 | # Checks for header files. 33 | AC_CHECK_HEADERS([arpa/inet.h fcntl.h stdint.h stdlib.h string.h sys/socket.h sys/time.h unistd.h]) 34 | AC_CHECK_HEADERS([glib.h],[found_glib_h=yes],[found_glib_h=no]) 35 | if test "$found_glib_h" = "no"; then 36 | AC_MSG_ERROR([*** Working glib.h header not found ***]) 37 | fi 38 | AC_CHECK_HEADERS([openssl/sha.h], [found_sha_h=yes],[found_sha_h=no]) 39 | if test "$found_sha_h" = "no"; then 40 | AC_MSG_ERROR([*** Working sha.h header not found ***]) 41 | fi 42 | #AC_CHECK_HEADERS([mysql/mysql.h], [found_mysql_h=yes],[found_mysql_h=no]) 43 | #if test "$found_mysql_h" = "no"; then 44 | #AC_MSG_ERROR([*** Working mysql/mysql.h header not found ***]) 45 | #fi 46 | 47 | # Checks for typedefs, structures, and compiler characteristics. 
48 | AC_C_INLINE 49 | AC_TYPE_INT32_T 50 | AC_TYPE_INT64_T 51 | AC_TYPE_OFF_T 52 | AC_TYPE_SIZE_T 53 | AC_TYPE_UINT32_T 54 | AC_TYPE_UINT64_T 55 | AC_TYPE_UINT8_T 56 | 57 | # Checks for library functions. 58 | AC_FUNC_MALLOC 59 | #AC_CHECK_FUNCS([gettimeofday memmove memset socket strerror]) 60 | 61 | AC_OUTPUT(Makefile 62 | src/Makefile 63 | src/utils/Makefile 64 | src/index/Makefile 65 | src/recipe/Makefile 66 | src/storage/Makefile 67 | src/chunking/Makefile 68 | src/fsl/Makefile) 69 | -------------------------------------------------------------------------------- /scripts/cache.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dataset="kernel" 4 | 5 | if [ $# -gt 0 ];then 6 | echo "dataset <- $1" 7 | dataset=$1 8 | else 9 | echo "default dataset <- $dataset" 10 | fi 11 | 12 | kernel_path="/home/dataset/kernel_8k/" 13 | vmdk_path="/home/dataset/vmdk_4k/" 14 | rdb_path="/home/dataset/rdb_4k/" 15 | synthetic_path="/home/dataset/synthetic_8k/" 16 | 17 | # path: where trace files locate 18 | # fcs: the restore cache size 19 | case $dataset in 20 | "kernel") 21 | path=$kernel_path 22 | rcs=32 23 | ;; 24 | "vmdk") 25 | path=$vmdk_path 26 | rcs=256 27 | ;; 28 | "rdb") 29 | path=$rdb_path 30 | rcs=64 31 | ;; 32 | "synthetic") 33 | path=$synthetic_path 34 | rcs=64 35 | ;; 36 | *) 37 | echo "Wrong dataset!" 
38 | exit 1 39 | ;; 40 | esac 41 | 42 | # split the restore.log according to the restore cache size 43 | split_file(){ 44 | lines=$(cat $1) # read the file 45 | IFS=$'\n' # split 'lines' by '\n' 46 | lineno=0 47 | for line in $lines; do 48 | if [ $(($lineno%3)) -eq 0 ];then 49 | echo $line >> restore.lru.log 50 | elif [ $(($lineno%3)) -eq 1 ];then 51 | echo $line >> restore.opt.log 52 | else 53 | echo $line >> restore.asm.log 54 | fi 55 | lineno=$(($lineno+1)) 56 | done 57 | } 58 | 59 | # ./rebuild would clear data of previous experiments 60 | # ./destor executes a backup job 61 | # (results are written to backup.log) 62 | # ./destor -rN executes a restore job under various restore cache size 63 | # (results are written to restore.log) 64 | i=0 65 | ../rebuild 66 | for file in $(ls $path);do 67 | ../destor $path/$file -p"fingerprint-index exact physical" >> log 68 | ../destor -r$i /home/fumin/restore -p"restore-cache lru $rcs" >> log 69 | ../destor -r$i /home/fumin/restore -p"restore-cache opt $rcs" >> log 70 | ../destor -r$i /home/fumin/restore -p"restore-cache asm $rcs" >> log 71 | i=$(($i+1)) 72 | done 73 | ../destor -s >> backup.log 74 | 75 | split_file restore.log 76 | rm restore.log 77 | 78 | i=0 79 | ../rebuild 80 | for file in $(ls $path);do 81 | ../destor $path/$file -p"fingerprint-index exact physical" -p"rewrite-enable-har yes" -p"rewrite-har-utilization-threshold 0.5" >> log 82 | ../destor -r$i /home/fumin/restore -p"restore-cache lru $rcs" >> log 83 | ../destor -r$i /home/fumin/restore -p"restore-cache opt $rcs" >> log 84 | ../destor -r$i /home/fumin/restore -p"restore-cache asm $rcs" >> log 85 | i=$(($i+1)) 86 | done 87 | ../destor -s >> backup.log 88 | 89 | split_file restore.log 90 | -------------------------------------------------------------------------------- /src/cfl_rewrite.c: -------------------------------------------------------------------------------- 1 | #include "destor.h" 2 | #include "jcr.h" 3 | #include "rewrite_phase.h" 4 | #include 
"storage/containerstore.h" 5 | #include "backup.h" 6 | 7 | static int64_t chunk_num; 8 | 9 | /* --------------------------------------------------------------------------*/ 10 | /** 11 | * @Synopsis Assuring Demanded Read Performance of Data Deduplication Storage 12 | * with Backup Datasets. In MASCOTS'12. 13 | * 14 | * @Param arg 15 | * 16 | * @Returns 17 | */ 18 | /* ----------------------------------------------------------------------------*/ 19 | void *cfl_rewrite(void* arg) { 20 | /* 21 | * A chunk with an ID that is different from the chunks in buffer, 22 | * or a NULL pointer, 23 | * indicates a segment boundary (return 1). 24 | */ 25 | containerid last_id = TEMPORARY_ID; 26 | int buffer_full = 0; 27 | while (1) { 28 | struct chunk* c = sync_queue_pop(dedup_queue); 29 | if (c == NULL) { 30 | /* The end */ 31 | break; 32 | } 33 | 34 | if ((last_id != TEMPORARY_ID && last_id != c->id) || buffer_full == 1) { 35 | /* judge */ 36 | int out_of_order = rewrite_buffer.size 37 | < destor.rewrite_cfl_usage_threshold 38 | * (CONTAINER_SIZE - CONTAINER_META_SIZE); 39 | 40 | struct chunk* bc; 41 | while ((bc = rewrite_buffer_pop())) { 42 | if (CHECK_CHUNK(bc, CHUNK_FILE_START) || CHECK_CHUNK(bc, CHUNK_FILE_END) 43 | || CHECK_CHUNK(bc, CHUNK_SEGMENT_START) 44 | || CHECK_CHUNK(bc, CHUNK_SEGMENT_END)) { 45 | sync_queue_push(rewrite_queue, bc); 46 | continue; 47 | } 48 | 49 | if (out_of_order && bc->id != TEMPORARY_ID) { 50 | assert(CHECK_CHUNK(bc, CHUNK_DUPLICATE)); 51 | SET_CHUNK(bc, CHUNK_OUT_OF_ORDER); 52 | VERBOSE( 53 | "Rewrite phase: %lldth chunk is in out-of-order container %lld", 54 | chunk_num, bc->id); 55 | } 56 | chunk_num++; 57 | sync_queue_push(rewrite_queue, bc); 58 | } 59 | buffer_full = 0; 60 | } 61 | 62 | last_id = c->id; 63 | if (rewrite_buffer_push(c)) { 64 | buffer_full = 1; 65 | } 66 | } 67 | 68 | int out_of_order = rewrite_buffer.size 69 | < destor.rewrite_cfl_usage_threshold * (CONTAINER_SIZE - CONTAINER_META_SIZE); 70 | 71 | struct chunk* bc; 72 | 
while ((bc = rewrite_buffer_pop())) { 73 | if (CHECK_CHUNK(bc, CHUNK_FILE_START) || CHECK_CHUNK(bc, CHUNK_FILE_END) 74 | || CHECK_CHUNK(bc, CHUNK_SEGMENT_START) 75 | || CHECK_CHUNK(bc, CHUNK_SEGMENT_END)) { 76 | sync_queue_push(rewrite_queue, bc); 77 | continue; 78 | } 79 | 80 | if (out_of_order && bc->id != TEMPORARY_ID) { 81 | assert(CHECK_CHUNK(bc, CHUNK_DUPLICATE)); 82 | SET_CHUNK(bc, CHUNK_OUT_OF_ORDER); 83 | VERBOSE("Rewrite phase: %lldth chunk is in out-of-order container %lld", 84 | chunk_num, bc->id); 85 | } 86 | chunk_num++; 87 | sync_queue_push(rewrite_queue, bc); 88 | } 89 | buffer_full = 0; 90 | 91 | sync_queue_term(rewrite_queue); 92 | return NULL; 93 | } 94 | -------------------------------------------------------------------------------- /scripts/interplay.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dataset="kernel" 4 | 5 | if [ $# -gt 0 ];then 6 | echo "dataset <- $1" 7 | dataset=$1 8 | else 9 | echo "default dataset <- $dataset" 10 | fi 11 | 12 | kernel_path="/home/dataset/kernel_8k/" 13 | vmdk_path="/home/dataset/vmdk_4k/" 14 | rdb_path="/home/dataset/rdb_4k/" 15 | synthetic_path="/home/dataset/synthetic_8k/" 16 | 17 | # path: where trace files locate 18 | # fcs: the restore cache size 19 | case $dataset in 20 | "kernel") 21 | path=$kernel_path 22 | rcs=128 23 | ;; 24 | "vmdk") 25 | path=$vmdk_path 26 | rcs=1024 27 | ;; 28 | "rdb") 29 | path=$rdb_path 30 | rcs=1024 31 | ;; 32 | "synthetic") 33 | path=$synthetic_path 34 | rcs=1024 35 | ;; 36 | *) 37 | echo "Wrong dataset!" 
38 | exit 1 39 | ;; 40 | esac 41 | 42 | # ./rebuild would clear data of previous experiments 43 | # ./destor executes a backup job 44 | # (results are written to backup.log) 45 | # ./destor -rN executes a restore job under various restore cache size 46 | # (results are written to restore.log) 47 | 48 | # EDPL+HAR 49 | i=0 50 | ../rebuild 51 | for file in $(ls $path);do 52 | ../destor $path/$file -p"fingerprint-index exact physical" -p"rewrite-enable-har yes" -p"rewrite-har-utilization-threshold 0.4" -p"rewrite-har-rewrite-limit 1" >> log 53 | ../destor -r$i /home/fumin/restore -p"restore-cache opt $rcs" >> log 54 | i=$(($i+1)) 55 | done 56 | ../destor -s >> backup.log 57 | 58 | # NDPL+HAR 59 | i=0 60 | ../rebuild 61 | for file in $(ls $path);do 62 | ../destor $path/$file -p"fingerprint-index near-exact physical" -p"fingerprint-index-sampling-method uniform 128" -p"rewrite-enable-har yes" -p"rewrite-har-utilization-threshold 0.4" -p"rewrite-har-rewrite-limit 1" >> log 63 | ../destor -r$i /home/fumin/restore -p"restore-cache opt $rcs" >> log 64 | i=$(($i+1)) 65 | done 66 | ../destor -s >> backup.log 67 | 68 | # EDLL+HAR 69 | i=0 70 | ../rebuild 71 | for file in $(ls $path);do 72 | ../destor $path/$file -p"fingerprint-index exact logical" -p"fingerprint-index-sampling-method random 256" -p"rewrite-enable-har yes" -p"rewrite-har-utilization-threshold 0.4" -p"rewrite-har-rewrite-limit 1" >> log 73 | ../destor -r$i /home/fumin/restore -p"restore-cache opt $rcs" >> log 74 | i=$(($i+1)) 75 | done 76 | ../destor -s >> backup.log 77 | 78 | # NDLL+HAR 79 | i=0 80 | ../rebuild 81 | for file in $(ls $path);do 82 | ../destor $path/$file -p"fingerprint-index near-exact logical" -p"fingerprint-index-sampling-method random 128" -p"fingerprint-index-segment-selection top 4" -p"fingerprint-index-segment-prefetching 4" -p"rewrite-enable-har yes" -p"rewrite-har-utilization-threshold 0.4" -p"rewrite-har-rewrite-limit 1" >> log 83 | ../destor -r$i /home/fumin/restore -p"restore-cache opt 
$rcs" >> log 84 | i=$(($i+1)) 85 | done 86 | ../destor -s >> backup.log 87 | -------------------------------------------------------------------------------- /src/utils/queue.c: -------------------------------------------------------------------------------- 1 | /* 2 | * queue.c 3 | * 4 | * Created on: May 21, 2012 5 | * Author: fumin 6 | */ 7 | 8 | #include 9 | #include 10 | #include "queue.h" 11 | 12 | Queue* queue_new() { 13 | Queue *queue = (Queue*) malloc(sizeof(Queue)); 14 | queue->first = queue->last = 0; 15 | queue->elem_num = 0; 16 | return queue; 17 | } 18 | 19 | void queue_init(Queue *queue) { 20 | queue->first = queue->last = 0; 21 | queue->elem_num = 0; 22 | } 23 | 24 | void queue_empty(Queue *queue, void (*free_data)(void*)) { 25 | while (queue->elem_num) { 26 | void *data = queue_pop(queue); 27 | free_data(data); 28 | } 29 | } 30 | 31 | void queue_free(Queue *queue, void (*free_data)(void*)) { 32 | queue_empty(queue, free_data); 33 | free(queue); 34 | } 35 | 36 | void queue_push(Queue *queue, void *element) { 37 | queue_ele_t *item; 38 | 39 | if ((item = (queue_ele_t *) malloc(sizeof(queue_ele_t))) == 0) { 40 | puts("Not enough memory!"); 41 | return; 42 | } 43 | item->data = element; 44 | item->next = 0; 45 | 46 | /* Add to end of queue */ 47 | if (queue->first == 0) { 48 | queue->first = item; 49 | } else { 50 | queue->last->next = item; 51 | } 52 | queue->last = item; 53 | 54 | ++queue->elem_num; 55 | } 56 | 57 | void* queue_pop(Queue *queue) { 58 | queue_ele_t *item = 0; 59 | if (queue->elem_num == 0) 60 | return NULL; 61 | 62 | item = queue->first; 63 | 64 | queue->first = item->next; 65 | if (queue->last == item) 66 | queue->last = NULL; 67 | --queue->elem_num; 68 | 69 | void *ret = item->data; 70 | free(item); 71 | return ret; 72 | } 73 | 74 | void * queue_top(Queue *queue) { 75 | if (queue->elem_num == 0) 76 | return NULL; 77 | return queue->first->data; 78 | } 79 | 80 | int queue_size(Queue *queue) { 81 | return queue->elem_num; 82 | } 83 | 
84 | void queue_foreach(Queue *queue, void (*func)(void *data, void *user_data), 85 | void *user_data) { 86 | queue_ele_t *item = 0; 87 | if (queue->elem_num == 0) 88 | return; 89 | item = queue->first; 90 | while (item) { 91 | func(item->data, user_data); 92 | item = item->next; 93 | } 94 | } 95 | 96 | /* 97 | * return the nth elem in queue. 98 | */ 99 | void* queue_get_n(Queue *queue, int n) { 100 | if (n >= queue_size(queue)) { 101 | return NULL; 102 | } 103 | int i = 0; 104 | queue_ele_t *item = queue->first; 105 | while (i < n) { 106 | item = item->next; 107 | ++i; 108 | } 109 | return item->data; 110 | 111 | } 112 | 113 | /* 114 | * Iterate the Queue to find an elem which meets the condition ('hit' returns 1). 115 | */ 116 | void* queue_find(Queue* queue, int (*hit)(void*, void*), void* data) { 117 | 118 | queue_ele_t *item = 0; 119 | if (queue->elem_num == 0) 120 | return NULL; 121 | 122 | item = queue->first; 123 | do { 124 | if (hit(item->data, data) == 1) 125 | break; 126 | } while ((item = item->next)); 127 | 128 | return item ? 
item->data : NULL; 129 | 130 | } 131 | -------------------------------------------------------------------------------- /src/read_phase.c: -------------------------------------------------------------------------------- 1 | #include "destor.h" 2 | #include "jcr.h" 3 | #include "backup.h" 4 | 5 | static pthread_t read_t; 6 | 7 | static void read_file(sds path) { 8 | static unsigned char buf[DEFAULT_BLOCK_SIZE]; 9 | 10 | sds filename = sdsdup(path); 11 | 12 | if (jcr.path[sdslen(jcr.path) - 1] == '/') { 13 | /* the backup path points to a direcory */ 14 | sdsrange(filename, sdslen(jcr.path), -1); 15 | } else { 16 | /* the backup path points to a file */ 17 | int cur = sdslen(filename) - 1; 18 | while (filename[cur] != '/') 19 | cur--; 20 | sdsrange(filename, cur, -1); 21 | } 22 | 23 | FILE *fp; 24 | if ((fp = fopen(path, "r")) == NULL) { 25 | destor_log(DESTOR_WARNING, "Can not open file %s\n", path); 26 | perror("The reason is"); 27 | exit(1); 28 | } 29 | 30 | struct chunk *c = new_chunk(sdslen(filename) + 1); 31 | strcpy(c->data, filename); 32 | 33 | VERBOSE("Read phase: %s", filename); 34 | 35 | SET_CHUNK(c, CHUNK_FILE_START); 36 | 37 | sync_queue_push(read_queue, c); 38 | 39 | TIMER_DECLARE(1); 40 | TIMER_BEGIN(1); 41 | int size = 0; 42 | 43 | while ((size = fread(buf, 1, DEFAULT_BLOCK_SIZE, fp)) != 0) { 44 | TIMER_END(1, jcr.read_time); 45 | 46 | VERBOSE("Read phase: read %d bytes", size); 47 | 48 | c = new_chunk(size); 49 | memcpy(c->data, buf, size); 50 | 51 | sync_queue_push(read_queue, c); 52 | 53 | TIMER_BEGIN(1); 54 | } 55 | 56 | c = new_chunk(0); 57 | SET_CHUNK(c, CHUNK_FILE_END); 58 | sync_queue_push(read_queue, c); 59 | 60 | fclose(fp); 61 | 62 | sdsfree(filename); 63 | } 64 | 65 | static void find_one_file(sds path) { 66 | 67 | if (strcmp(path + sdslen(path) - 1, "/") == 0) { 68 | 69 | DIR *dir = opendir(path); 70 | struct dirent *entry; 71 | 72 | while ((entry = readdir(dir)) != 0) { 73 | /*ignore . 
and ..*/ 74 | if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) 75 | continue; 76 | sds newpath = sdsdup(path); 77 | newpath = sdscat(newpath, entry->d_name); 78 | 79 | struct stat state; 80 | if (stat(newpath, &state) != 0) { 81 | WARNING("The file %s does not exist! ignored!", newpath); 82 | return; 83 | } 84 | 85 | if (S_ISDIR(state.st_mode)) { 86 | assert(strcmp(newpath + sdslen(newpath) - 1, "/") != 0); 87 | newpath = sdscat(newpath, "/"); 88 | } 89 | 90 | find_one_file(newpath); 91 | 92 | sdsfree(newpath); 93 | } 94 | 95 | closedir(dir); 96 | } else { 97 | read_file(path); 98 | } 99 | } 100 | 101 | static void* read_thread(void *argv) { 102 | /* Each file will be processed separately */ 103 | find_one_file(jcr.path); 104 | sync_queue_term(read_queue); 105 | return NULL; 106 | } 107 | 108 | void start_read_phase() { 109 | /* running job */ 110 | jcr.status = JCR_STATUS_RUNNING; 111 | read_queue = sync_queue_new(10); 112 | pthread_create(&read_t, NULL, read_thread, NULL); 113 | } 114 | 115 | void stop_read_phase() { 116 | pthread_join(read_t, NULL); 117 | NOTICE("read phase stops successfully!"); 118 | } 119 | 120 | -------------------------------------------------------------------------------- /src/index/fingerprint_cache.c: -------------------------------------------------------------------------------- 1 | /* 2 | * fingerprint_cache.c 3 | * 4 | * Created on: Mar 24, 2014 5 | * Author: fumin 6 | */ 7 | #include "../destor.h" 8 | #include "index.h" 9 | #include "../storage/containerstore.h" 10 | #include "../recipe/recipestore.h" 11 | #include "../utils/lru_cache.h" 12 | 13 | static struct lruCache* lru_queue; 14 | 15 | /* defined in index.c */ 16 | extern struct { 17 | /* Requests to the key-value store */ 18 | int lookup_requests; 19 | int update_requests; 20 | int lookup_requests_for_unique; 21 | /* Overheads of prefetching module */ 22 | int read_prefetching_units; 23 | }index_overhead; 24 | 25 | void init_fingerprint_cache(){ 26 | 
switch(destor.index_category[1]){ 27 | case INDEX_CATEGORY_PHYSICAL_LOCALITY: 28 | lru_queue = new_lru_cache(destor.index_cache_size, 29 | free_container_meta, lookup_fingerprint_in_container_meta); 30 | break; 31 | case INDEX_CATEGORY_LOGICAL_LOCALITY: 32 | lru_queue = new_lru_cache(destor.index_cache_size, 33 | free_segment_recipe, lookup_fingerprint_in_segment_recipe); 34 | break; 35 | default: 36 | WARNING("Invalid index category!"); 37 | exit(1); 38 | } 39 | } 40 | 41 | int64_t fingerprint_cache_lookup(fingerprint *fp){ 42 | switch(destor.index_category[1]){ 43 | case INDEX_CATEGORY_PHYSICAL_LOCALITY:{ 44 | struct containerMeta* cm = lru_cache_lookup(lru_queue, fp); 45 | if (cm) 46 | return cm->id; 47 | break; 48 | } 49 | case INDEX_CATEGORY_LOGICAL_LOCALITY:{ 50 | struct segmentRecipe* sr = lru_cache_lookup(lru_queue, fp); 51 | if(sr){ 52 | struct chunkPointer* cp = g_hash_table_lookup(sr->kvpairs, fp); 53 | if(cp->id <= TEMPORARY_ID){ 54 | WARNING("expect > TEMPORARY_ID, but being %lld", cp->id); 55 | assert(cp->id > TEMPORARY_ID); 56 | } 57 | return cp->id; 58 | } 59 | break; 60 | } 61 | } 62 | 63 | return TEMPORARY_ID; 64 | } 65 | 66 | void fingerprint_cache_prefetch(int64_t id){ 67 | switch(destor.index_category[1]){ 68 | case INDEX_CATEGORY_PHYSICAL_LOCALITY:{ 69 | struct containerMeta * cm = retrieve_container_meta_by_id(id); 70 | index_overhead.read_prefetching_units++; 71 | if (cm) { 72 | lru_cache_insert(lru_queue, cm, NULL, NULL); 73 | } else{ 74 | WARNING("Error! The container %lld has not been written!", id); 75 | exit(1); 76 | } 77 | break; 78 | } 79 | case INDEX_CATEGORY_LOGICAL_LOCALITY:{ 80 | if (!lru_cache_hits(lru_queue, &id, 81 | segment_recipe_check_id)){ 82 | /* 83 | * If the segment we need is already in cache, 84 | * we do not need to read it. 
85 | */ 86 | GQueue* segments = prefetch_segments(id, 87 | destor.index_segment_prefech); 88 | index_overhead.read_prefetching_units++; 89 | VERBOSE("Dedup phase: prefetch %d segments into %d cache", 90 | g_queue_get_length(segments), 91 | destor.index_cache_size); 92 | struct segmentRecipe* sr; 93 | while ((sr = g_queue_pop_tail(segments))) { 94 | /* From tail to head */ 95 | if (!lru_cache_hits(lru_queue, &sr->id, 96 | segment_recipe_check_id)) { 97 | lru_cache_insert(lru_queue, sr, NULL, NULL); 98 | } else { 99 | /* Already in cache */ 100 | free_segment_recipe(sr); 101 | } 102 | } 103 | g_queue_free(segments); 104 | } 105 | break; 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /src/do_delete.c: -------------------------------------------------------------------------------- 1 | /* 2 | * delete_server.c 3 | * 4 | * Created on: Jun 21, 2012 5 | * Author: fumin 6 | */ 7 | #include "destor.h" 8 | #include "storage/containerstore.h" 9 | #include "recipe/recipestore.h" 10 | #include "index/index.h" 11 | #include "cma.h" 12 | 13 | /* A simple wrap. 14 | * Just to make the interfaces of the index module more consistent. 15 | */ 16 | static inline void delete_an_entry(fingerprint *fp, int64_t *id){ 17 | index_delete(fp, *id); 18 | } 19 | 20 | /* 21 | * We assume a FIFO order of deleting backup, namely the oldest backup is deleted first. 22 | */ 23 | void do_delete(int jobid) { 24 | 25 | GHashTable *invalid_containers = trunc_manifest(jobid); 26 | 27 | init_index(); 28 | init_recipe_store(); 29 | 30 | /* Delete the invalid entries in the key-value store */ 31 | if(destor.index_category[1] == INDEX_CATEGORY_PHYSICAL_LOCALITY){ 32 | init_container_store(); 33 | 34 | struct backupVersion* bv = open_backup_version(jobid); 35 | 36 | /* The entries pointing to Invalid Containers are invalid. 
*/ 37 | GHashTableIter iter; 38 | gpointer key, value; 39 | g_hash_table_iter_init(&iter, invalid_containers); 40 | while(g_hash_table_iter_next(&iter, &key, &value)){ 41 | containerid id = *(containerid*)key; 42 | NOTICE("Reclaim container %lld", id); 43 | struct containerMeta* cm = retrieve_container_meta_by_id(id); 44 | 45 | container_meta_foreach(cm, delete_an_entry, &id); 46 | 47 | free_container_meta(cm); 48 | } 49 | 50 | bv->deleted = 1; 51 | update_backup_version(bv); 52 | free_backup_version(bv); 53 | 54 | close_container_store(); 55 | }else if(destor.index_category[1] == INDEX_CATEGORY_LOGICAL_LOCALITY){ 56 | /* Ideally, the entries pointing to segments in backup versions of a 'bv_num' less than 'jobid' are invalid. */ 57 | /* (For simplicity) Since a FIFO order is given, we only need to remove the IDs exactly matched 'bv_num'. */ 58 | struct backupVersion* bv = open_backup_version(jobid); 59 | 60 | struct segmentRecipe* sr; 61 | while((sr=read_next_segment(bv))){ 62 | segment_recipe_foreach(sr, delete_an_entry, &sr->id); 63 | } 64 | 65 | bv->deleted = 1; 66 | update_backup_version(bv); 67 | free_backup_version(bv); 68 | 69 | }else{ 70 | WARNING("Invalid index type"); 71 | exit(1); 72 | } 73 | 74 | close_recipe_store(); 75 | close_index(); 76 | 77 | char logfile[] = "delete.log"; 78 | FILE *fp = fopen(logfile, "a"); 79 | /* 80 | * ID of the job we delete, 81 | * number of live containers, 82 | * memory footprint 83 | */ 84 | fprintf(fp, "%d %d %d\n", 85 | jobid, 86 | destor.live_container_num, 87 | destor.index_memory_footprint); 88 | 89 | fclose(fp); 90 | 91 | /* record the IDs of invalid containers */ 92 | sds didfilepath = sdsdup(destor.working_directory); 93 | char s[128]; 94 | sprintf(s, "recipes/delete_%d.id", jobid); 95 | didfilepath = sdscat(didfilepath, s); 96 | 97 | FILE* didfile = fopen(didfilepath, "w"); 98 | if(didfile){ 99 | GHashTableIter iter; 100 | gpointer key, value; 101 | g_hash_table_iter_init(&iter, invalid_containers); 102 | 
while(g_hash_table_iter_next(&iter, &key, &value)){ 103 | containerid id = *(containerid*)key; 104 | fprintf(didfile, "%lld\n", id); 105 | } 106 | 107 | fclose(didfile); 108 | } 109 | 110 | 111 | g_hash_table_destroy(invalid_containers); 112 | } 113 | -------------------------------------------------------------------------------- /src/cap_rewrite.c: -------------------------------------------------------------------------------- 1 | #include "destor.h" 2 | #include "jcr.h" 3 | #include "rewrite_phase.h" 4 | #include "backup.h" 5 | 6 | static int64_t chunk_num; 7 | 8 | static GHashTable *top; 9 | 10 | static void cap_segment_get_top() { 11 | 12 | /* Descending order */ 13 | g_sequence_sort(rewrite_buffer.container_record_seq, 14 | g_record_descmp_by_length, NULL); 15 | 16 | int length = g_sequence_get_length(rewrite_buffer.container_record_seq); 17 | int32_t num = length > destor.rewrite_capping_level ? 18 | destor.rewrite_capping_level : length, i; 19 | GSequenceIter *iter = g_sequence_get_begin_iter( 20 | rewrite_buffer.container_record_seq); 21 | for (i = 0; i < num; i++) { 22 | assert(!g_sequence_iter_is_end(iter)); 23 | struct containerRecord* record = g_sequence_get(iter); 24 | struct containerRecord* r = (struct containerRecord*) malloc( 25 | sizeof(struct containerRecord)); 26 | memcpy(r, record, sizeof(struct containerRecord)); 27 | r->out_of_order = 0; 28 | g_hash_table_insert(top, &r->cid, r); 29 | iter = g_sequence_iter_next(iter); 30 | } 31 | 32 | VERBOSE("Rewrite phase: Select Top-%d in %d containers", num, length); 33 | 34 | g_sequence_sort(rewrite_buffer.container_record_seq, g_record_cmp_by_id, NULL); 35 | } 36 | 37 | /* 38 | * We first assemble a fixed-sized buffer of pending chunks. 39 | * Then, counting container utilization in the buffer and sorting. 40 | * The pending chunks in containers of most low utilization are fragmentation. 41 | * The main drawback of capping, 42 | * is that capping overlook the relationship of consecutive buffers. 
43 | */ 44 | void *cap_rewrite(void* arg) { 45 | top = g_hash_table_new_full(g_int64_hash, g_int64_equal, NULL, free); 46 | 47 | while (1) { 48 | struct chunk *c = sync_queue_pop(dedup_queue); 49 | 50 | if (c == NULL) 51 | break; 52 | 53 | TIMER_DECLARE(1); 54 | TIMER_BEGIN(1); 55 | if (!rewrite_buffer_push(c)) { 56 | TIMER_END(1, jcr.rewrite_time); 57 | continue; 58 | } 59 | 60 | cap_segment_get_top(); 61 | 62 | while ((c = rewrite_buffer_pop())) { 63 | if (!CHECK_CHUNK(c, CHUNK_FILE_START) 64 | && !CHECK_CHUNK(c, CHUNK_FILE_END) 65 | && !CHECK_CHUNK(c, CHUNK_SEGMENT_START) 66 | && !CHECK_CHUNK(c, CHUNK_SEGMENT_END) 67 | && CHECK_CHUNK(c, CHUNK_DUPLICATE)) { 68 | if (g_hash_table_lookup(top, &c->id) == NULL) { 69 | /* not in TOP */ 70 | SET_CHUNK(c, CHUNK_OUT_OF_ORDER); 71 | VERBOSE("Rewrite phase: %lldth chunk is in out-of-order container %lld", 72 | chunk_num, c->id); 73 | } 74 | chunk_num++; 75 | } 76 | TIMER_END(1, jcr.rewrite_time); 77 | sync_queue_push(rewrite_queue, c); 78 | TIMER_BEGIN(1); 79 | } 80 | 81 | g_hash_table_remove_all(top); 82 | 83 | } 84 | 85 | cap_segment_get_top(); 86 | 87 | struct chunk *c; 88 | while ((c = rewrite_buffer_pop())) { 89 | if (!CHECK_CHUNK(c, CHUNK_FILE_START) && !CHECK_CHUNK(c, CHUNK_FILE_END) 90 | && !CHECK_CHUNK(c, CHUNK_SEGMENT_START) && !CHECK_CHUNK(c, CHUNK_SEGMENT_END)) { 91 | if (g_hash_table_lookup(top, &c->id) == NULL) { 92 | /* not in TOP */ 93 | SET_CHUNK(c, CHUNK_OUT_OF_ORDER); 94 | VERBOSE("Rewrite phase: %lldth chunk is in out-of-order container %lld", 95 | chunk_num, c->id); 96 | } 97 | chunk_num++; 98 | } 99 | sync_queue_push(rewrite_queue, c); 100 | } 101 | 102 | g_hash_table_remove_all(top); 103 | 104 | sync_queue_term(rewrite_queue); 105 | 106 | return NULL; 107 | } 108 | -------------------------------------------------------------------------------- /src/utils/lru_cache.c: -------------------------------------------------------------------------------- 1 | /* 2 | * cache.c 3 | * 4 | * Created on: May 23, 
/*
 * lru_cache.c (file header previously titled "cache.c")
 *
 * A small LRU cache built on a GLib doubly-linked list:
 * the head of elem_queue is the most-recently-used element and the
 * tail is the eviction victim.
 *
 * Created on: May 23, 2012
 *     Author: fumin
 */

/* NOTE(review): the angle-bracket include targets were lost when this file
 * was dumped; stdio/stdlib/glib are the presumed headers — confirm against
 * the repository. */
#include <stdio.h>
#include <stdlib.h>
#include <glib.h>
#include "lru_cache.h"

/*
 * The container read cache.
 * size < 0 means "unbounded" (see lru_cache_is_full / lru_cache_insert).
 * free_elem: destructor for evicted/owned elements.
 * hit_elem(elem, user_data): returns non-zero when elem matches user_data.
 */
struct lruCache* new_lru_cache(int size, void (*free_elem)(void *),
        int (*hit_elem)(void* elem, void* user_data)) {
    struct lruCache* c = (struct lruCache*) malloc(sizeof(struct lruCache));

    c->elem_queue = NULL;

    c->max_size = size;
    c->size = 0;
    c->hit_count = 0;
    c->miss_count = 0;

    c->free_elem = free_elem;
    c->hit_elem = hit_elem;

    return c;
}

/* Destroy the cache and every element it still owns. */
void free_lru_cache(struct lruCache* c) {
    g_list_free_full(c->elem_queue, c->free_elem);
    free(c);
}

/* find an item in cache matching the condition;
 * on a hit the item is moved to the head (MRU) and hit_count bumped,
 * on a miss miss_count is bumped and NULL returned. */
void* lru_cache_lookup(struct lruCache* c, void* user_data) {
    GList* elem = g_list_first(c->elem_queue);
    while (elem) {
        if (c->hit_elem(elem->data, user_data))
            break;
        elem = g_list_next(elem);
    }
    if (elem) {
        /* unlink the node and re-link it at the head: O(1) MRU promotion */
        c->elem_queue = g_list_remove_link(c->elem_queue, elem);
        c->elem_queue = g_list_concat(elem, c->elem_queue);
        c->hit_count++;
        return elem->data;
    } else {
        c->miss_count++;
        return NULL;
    }
}

/* Same search as lru_cache_lookup, but neither the LRU order nor the
 * hit/miss statistics are touched. */
void* lru_cache_lookup_without_update(struct lruCache* c, void* user_data) {
    GList* elem = g_list_first(c->elem_queue);
    while (elem) {
        if (c->hit_elem(elem->data, user_data))
            break;
        elem = g_list_next(elem);
    }
    if (elem) {
        return elem->data;
    } else {
        return NULL;
    }
}

/*
 * Hit an existing elem for simulating an insertion of it.
 * Uses the caller-supplied 'hit' predicate instead of c->hit_elem;
 * promotes the element to MRU but does not update statistics.
 */
void* lru_cache_hits(struct lruCache* c, void* user_data,
        int (*hit)(void* elem, void* user_data)) {
    GList* elem = g_list_first(c->elem_queue);
    while (elem) {
        if (hit(elem->data, user_data))
            break;
        elem = g_list_next(elem);
    }
    if (elem) {
        c->elem_queue = g_list_remove_link(c->elem_queue, elem);
        c->elem_queue = g_list_concat(elem, c->elem_queue);
        return elem->data;
    } else {
        return NULL;
    }
}

/*
 * We know that the data does not exist!
 * Insert 'data' at the head; when the cache is bounded and full, the tail
 * element is evicted: 'func(victim, user_data)' (if given) runs first,
 * then the victim is destroyed with c->free_elem.
 */
void lru_cache_insert(struct lruCache *c, void* data,
        void (*func)(void*, void*), void* user_data) {
    void *victim = 0;
    if (c->max_size > 0 && c->size == c->max_size) {
        GList *last = g_list_last(c->elem_queue);
        c->elem_queue = g_list_remove_link(c->elem_queue, last);
        victim = last->data;
        g_list_free_1(last);
        c->size--;
    }

    c->elem_queue = g_list_prepend(c->elem_queue, data);
    c->size++;
    if (victim) {
        if (func)
            func(victim, user_data);
        c->free_elem(victim);
    }
}

/* kick out the first elem satisfying func, searching from the LRU end
 * (tail) towards the head; the element is destroyed with c->free_elem. */
void lru_cache_kicks(struct lruCache* c, void* user_data,
        int (*func)(void* elem, void* user_data)) {
    GList* elem = g_list_last(c->elem_queue);
    while (elem) {
        if (func(elem->data, user_data))
            break;
        elem = g_list_previous(elem);
    }
    if (elem) {
        c->elem_queue = g_list_remove_link(c->elem_queue, elem);
        c->free_elem(elem->data);
        g_list_free_1(elem);
        c->size--;
    }
}

/* A negative max_size marks an unbounded cache, which is never full. */
int lru_cache_is_full(struct lruCache* c) {
    if (c->max_size < 0)
        return 0;
    return c->size >= c->max_size ? 1 : 0;
}
#include "sync_queue.h"
/* NOTE(review): the angle-bracket include target was lost in the dump;
 * <stdio.h> is presumed (puts is used below) — confirm against the repo. */
#include <stdio.h>

/*
 * A blocking bounded queue built on Queue (queue.c) plus a mutex and two
 * condition variables:
 *   max_work - signalled when space becomes available (producers wait on it)
 *   min_work - signalled when an item arrives or the queue is terminated
 *              (consumers wait on it)
 * size <= 0 makes the queue unbounded.
 */
SyncQueue* sync_queue_new(int size) {
    SyncQueue *s_queue = (SyncQueue*) malloc(sizeof(SyncQueue));
    s_queue->queue = queue_new();
    s_queue->max_size = size;
    s_queue->term = 0;

    if (pthread_mutex_init(&s_queue->mutex, 0)
            || pthread_cond_init(&s_queue->max_work, 0)
            || pthread_cond_init(&s_queue->min_work, 0)) {
        puts("Failed to init mutex or work in SyncQueue!");
        return NULL;
    }
    return s_queue;
}

/* Destroy the queue; remaining payloads are released via free_data. */
void sync_queue_free(SyncQueue* s_queue, void (*free_data)(void*)) {
    queue_free(s_queue->queue, free_data);
    pthread_mutex_destroy(&s_queue->mutex);
    pthread_cond_destroy(&s_queue->max_work);
    pthread_cond_destroy(&s_queue->min_work);
    free(s_queue);
}

/*
 * Block until there is room, then append 'item'.
 * If the queue has been terminated the item is silently dropped
 * (producers are expected to stop after termination).
 */
void sync_queue_push(SyncQueue* s_queue, void* item) {
    if (pthread_mutex_lock(&s_queue->mutex) != 0) {
        puts("failed to lock!");
        return;
    }

    if (s_queue->term == 1) {
        pthread_mutex_unlock(&s_queue->mutex);
        return;
    }

    /* wait-in-a-loop guards against spurious wakeups */
    while (s_queue->max_size > 0
            && queue_size(s_queue->queue) >= s_queue->max_size) {
        pthread_cond_wait(&s_queue->max_work, &s_queue->mutex);
    }

    queue_push(s_queue->queue, item);

    pthread_cond_broadcast(&s_queue->min_work);

    if (pthread_mutex_unlock(&s_queue->mutex)) {
        puts("failed to lock!");
        return;
    }
}

/*
 * Block until an item is available and remove it.
 * Return NULL if the queue is terminated (and drained).
 */
void* sync_queue_pop(SyncQueue* s_queue) {
    if (pthread_mutex_lock(&s_queue->mutex) != 0) {
        puts("failed to lock!");
        return NULL;
    }

    while (queue_size(s_queue->queue) == 0) {
        if (s_queue->term == 1) {
            /* terminated and empty: signal end-of-stream to the consumer */
            pthread_mutex_unlock(&s_queue->mutex);
            return NULL;
        }
        pthread_cond_wait(&s_queue->min_work, &s_queue->mutex);
    }

    void * item = queue_pop(s_queue->queue);
    pthread_cond_broadcast(&s_queue->max_work);

    pthread_mutex_unlock(&s_queue->mutex);
    return item;
}

/* NOTE(review): reads the size without taking the mutex, so the value is
 * only a snapshot when other threads are active — confirm callers tolerate
 * this race. */
int sync_queue_size(SyncQueue* s_queue) {
    return queue_size(s_queue->queue);
}

/*
 * Mark the queue terminated and wake all blocked consumers.
 * Subsequent pushes are dropped; pops drain the queue then return NULL.
 */
void sync_queue_term(SyncQueue* s_queue) {
    if (pthread_mutex_lock(&s_queue->mutex) != 0) {
        puts("failed to lock!");
        return;
    }

    s_queue->term = 1;

    pthread_cond_broadcast(&s_queue->min_work);

    pthread_mutex_unlock(&s_queue->mutex);
}

/*
 * Iterate the Queue to find an elem which meets the condition ('hit' returns 1).
 * Runs under the queue mutex; if 'dup' is given, a copy of the match is
 * returned so the caller never holds a pointer into the live queue.
 */
void* sync_queue_find(SyncQueue* s_queue, int (*hit)(void*, void*), void* data,
        void* (*dup)(void*)) {
    void* ret = NULL;

    if (pthread_mutex_lock(&s_queue->mutex) != 0) {
        puts("failed to lock!");
        return NULL;
    }

    ret = queue_find(s_queue->queue, hit, data);

    if (ret && dup) {
        /* Create a copy */
        ret = dup(ret);
    }

    pthread_mutex_unlock(&s_queue->mutex);

    return ret;
}

/*
 * Block until an item is available and return it WITHOUT removing it.
 * Returns NULL if the queue is terminated and empty.
 */
void* sync_queue_get_top(SyncQueue* s_queue) {
    if (pthread_mutex_lock(&s_queue->mutex) != 0) {
        puts("failed to lock!");
        return NULL;
    }

    while (queue_size(s_queue->queue) == 0) {
        if (s_queue->term == 1) {
            pthread_mutex_unlock(&s_queue->mutex);
            return NULL;
        }
        pthread_cond_wait(&s_queue->min_work, &s_queue->mutex);
    }

    void * item = queue_top(s_queue->queue);

    pthread_mutex_unlock(&s_queue->mutex);
    return item;
}
11 | */ 12 | static struct segment* segment_fixed(struct chunk * c) { 13 | static struct segment* tmp; 14 | if (tmp == NULL) 15 | tmp = new_segment(); 16 | 17 | if (c == NULL) 18 | /* The end of stream */ 19 | return tmp; 20 | 21 | g_sequence_append(tmp->chunks, c); 22 | if (CHECK_CHUNK(c, CHUNK_FILE_START) 23 | || CHECK_CHUNK(c, CHUNK_FILE_END)) 24 | /* FILE_END */ 25 | return NULL; 26 | 27 | /* a normal chunk */ 28 | tmp->chunk_num++; 29 | 30 | if (tmp->chunk_num == destor.index_segment_algorithm[1]) { 31 | /* segment boundary */ 32 | struct segment* ret = tmp; 33 | tmp = NULL; 34 | return ret; 35 | } 36 | 37 | return NULL; 38 | } 39 | 40 | /* 41 | * Used by Extreme Binning. 42 | */ 43 | static struct segment* segment_file_defined(struct chunk *c) { 44 | static struct segment* tmp; 45 | /* 46 | * For file-defined segmenting, 47 | * the end is not a new segment. 48 | */ 49 | if (tmp == NULL) 50 | tmp = new_segment(); 51 | 52 | if (c == NULL) 53 | return tmp; 54 | 55 | g_sequence_append(tmp->chunks, c); 56 | if (CHECK_CHUNK(c, CHUNK_FILE_END)) { 57 | struct segment* ret = tmp; 58 | tmp = NULL; 59 | return ret; 60 | } else if (CHECK_CHUNK(c, CHUNK_FILE_START)) { 61 | return NULL; 62 | } else { 63 | /* a normal chunk */ 64 | tmp->chunk_num++; 65 | return NULL; 66 | } 67 | } 68 | 69 | /* 70 | * Used by Sparse Index. 71 | */ 72 | static struct segment* segment_content_defined(struct chunk *c) { 73 | static struct segment* tmp; 74 | 75 | if (tmp == NULL) 76 | tmp = new_segment(); 77 | 78 | if (c == NULL) 79 | /* The end of stream */ 80 | return tmp; 81 | 82 | if (CHECK_CHUNK(c, CHUNK_FILE_START) || CHECK_CHUNK(c, CHUNK_FILE_END)) { 83 | g_sequence_append(tmp->chunks, c); 84 | return NULL; 85 | } 86 | 87 | /* Avoid too small segment. 
*/ 88 | if (tmp->chunk_num < destor.index_segment_min) { 89 | g_sequence_append(tmp->chunks, c); 90 | tmp->chunk_num++; 91 | return NULL; 92 | } 93 | 94 | int *head = (int*)&c->fp[16]; 95 | if ((*head) % destor.index_segment_algorithm[1] == 0) { 96 | struct segment* ret = tmp; 97 | tmp = new_segment(); 98 | g_sequence_append(tmp->chunks, c); 99 | tmp->chunk_num++; 100 | return ret; 101 | } 102 | 103 | g_sequence_append(tmp->chunks, c); 104 | tmp->chunk_num++; 105 | if (tmp->chunk_num >= destor.index_segment_max){ 106 | struct segment* ret = tmp; 107 | tmp = new_segment(); 108 | return ret; 109 | } 110 | 111 | return NULL; 112 | } 113 | 114 | void init_segmenting_method(){ 115 | switch (destor.index_segment_algorithm[0]) { 116 | case INDEX_SEGMENT_FIXED: 117 | segmenting = segment_fixed; 118 | break; 119 | case INDEX_SEGMENT_CONTENT_DEFINED: 120 | segmenting = segment_content_defined; 121 | break; 122 | case INDEX_SEGMENT_FILE_DEFINED: 123 | segmenting = segment_file_defined; 124 | break; 125 | default: 126 | fprintf(stderr, "Invalid segment algorithm!\n"); 127 | exit(1); 128 | } 129 | 130 | } 131 | -------------------------------------------------------------------------------- /src/recipe/recipestore.h: -------------------------------------------------------------------------------- 1 | /* 2 | * recipestore.h 3 | * 4 | * Created on: May 22, 2012 5 | * Author: fumin 6 | */ 7 | 8 | #ifndef RECIPESTORE_H_ 9 | #define RECIPESTORE_H_ 10 | 11 | #include "../destor.h" 12 | 13 | /* 14 | * A backup version 15 | * A backup version describes the fingerprint sequence of a backup job to facilitate restore jobs. 16 | * It consists of three metadata files: .meta, .recipe, and .record. 17 | * The .record file is for the optimal cache: it consists of access records of referred containers. 18 | * The .recipe file records the fingerprint sequence of the backup with segment boundary indicators. 
/*
 * recipestore.h
 *
 * Created on: May 22, 2012
 *     Author: fumin
 */

#ifndef RECIPESTORE_H_
#define RECIPESTORE_H_

#include "../destor.h"

/*
 * A backup version
 * A backup version describes the fingerprint sequence of a backup job to facilitate restore jobs.
 * It consists of three metadata files: .meta, .recipe, and .record.
 * The .record file is for the optimal cache: it consists of access records of referred containers.
 * The .recipe file records the fingerprint sequence of the backup with segment boundary indicators.
 * Hence, the .recipe file consists of segment recipes (each of which describes the
 * fingerprint sequence of a segment).
 * The .meta file consists of metadata of file recipes, i.e., fileRecipeMeta.
 * Each fileRecipeMeta structure describes the range of a file recipe in the .recipe file.
 * So, we can accurately restore a file.
 * */
struct backupVersion {

    sds path;          /* directory holding this version's files */
    int32_t bv_num;    /* backup version number start from 0 */

    int deleted;       /* non-zero once the version has been reclaimed */

    int64_t number_of_files;
    int64_t number_of_chunks;

    sds fname_prefix;  /* The prefix of the file names */

    FILE *metadata_fp; /* .meta stream */
    FILE *recipe_fp;   /* .recipe stream */
    FILE *record_fp;   /* .record stream */

    /* the write buffer of recipe meta */
    char *metabuf;
    int metabufoff;

    /* the write buffer of records */
    char *recordbuf;
    int recordbufoff;

    /* the write buffer of the current segment recipe */
    char* segmentbuf;
    int segmentlen;
    int segmentbufoff;
};

/* Point to the meta of a file recipe */
struct fileRecipeMeta {
    int64_t chunknum;  /* number of chunk pointers belonging to the file */
    int64_t filesize;  /* original file size in bytes */
    sds filename;
};

/*
 * Each recipe consists of segments.
 * Each prefetched segment is organized as a hash table for optimizing lookup.
 * It is the basic unit of logical locality.
 * */
struct segmentRecipe {
    segmentid id;
    /* Map fingerprints in the segment to their container IDs.*/
    GHashTable *kvpairs;
};

/*
 * If id == CHUNK_SEGMENT_START or CHUNK_SEGMENT_END,
 * it is a flag of segment boundary.
 * If id == CHUNK_SEGMENT_START,
 * size indicates the length of the segment in terms of # of chunks.
 */
struct chunkPointer {
    fingerprint fp;
    containerid id;
    int32_t size;
};

/* Open/close the global recipe store. */
void init_recipe_store();
void close_recipe_store();

/* Lifecycle of a single backup version. */
struct backupVersion* create_backup_version(const char *path);
int backup_version_exists(int number);
struct backupVersion* open_backup_version(int number);
void update_backup_version(struct backupVersion *b);
void free_backup_version(struct backupVersion *b);

/* Sequential write/read of recipe contents. */
void append_file_recipe_meta(struct backupVersion* b, struct fileRecipeMeta* r);
void append_n_chunk_pointers(struct backupVersion* b,
        struct chunkPointer* cp, int n);
struct fileRecipeMeta* read_next_file_recipe_meta(struct backupVersion* b);
struct chunkPointer* read_next_n_chunk_pointers(struct backupVersion* b, int n,
        int *k);
containerid* read_next_n_records(struct backupVersion* b, int n, int *k);
struct fileRecipeMeta* new_file_recipe_meta(char* name);
void free_file_recipe_meta(struct fileRecipeMeta* r);

/* Segment-recipe helpers (logical-locality index). */
int segment_recipe_check_id(struct segmentRecipe* sr, segmentid *id);
struct segmentRecipe* new_segment_recipe();
void free_segment_recipe(struct segmentRecipe* sr);
segmentid append_segment_flag(struct backupVersion* b, int flag, int segment_size);
GQueue* prefetch_segments(segmentid id, int prefetch_num);
int lookup_fingerprint_in_segment_recipe(struct segmentRecipe* sr,
        fingerprint *fp);

struct segmentRecipe* read_next_segment(struct backupVersion *bv);

#endif /* RECIPESTORE_H_ */
5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are met: 8 | * 9 | * * Redistributions of source code must retain the above copyright notice, 10 | * this list of conditions and the following disclaimer. 11 | * * Redistributions in binary form must reproduce the above copyright 12 | * notice, this list of conditions and the following disclaimer in the 13 | * documentation and/or other materials provided with the distribution. 14 | * * Neither the name of Redis nor the names of its contributors may be used 15 | * to endorse or promote products derived from this software without 16 | * specific prior written permission. 17 | * 18 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 22 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 23 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 24 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 25 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 26 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 27 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 28 | * POSSIBILITY OF SUCH DAMAGE. 
#ifndef __SDS_H
#define __SDS_H

#define SDS_MAX_PREALLOC (1024*1024)

/* NOTE(review): the original include targets were lost in extraction;
 * upstream sds.h pulls in these two headers (size_t and va_list) — confirm. */
#include <sys/types.h>
#include <stdarg.h>

/* An sds string is a plain char* pointing at sdshdr.buf, so it can be passed
 * to any libc string function; its header lives immediately before it. */
typedef char *sds;

struct sdshdr {
	int len;    /* bytes in use in buf (excluding the NUL terminator) */
	int free;   /* bytes allocated but currently unused */
	char buf[]; /* the string itself, NUL-terminated */
};

/* O(1) length: read it from the header preceding the buffer. */
static inline size_t sdslen(const sds s) {
	struct sdshdr *sh = (void*)(s-(sizeof(struct sdshdr)));
	return sh->len;
}

/* O(1) spare capacity. */
static inline size_t sdsavail(const sds s) {
	struct sdshdr *sh = (void*)(s-(sizeof(struct sdshdr)));
	return sh->free;
}

sds sdsnewlen(const void *init, size_t initlen);
sds sdsnew(const char *init);
sds sdsempty(void);
/* Fix: the duplicate non-static prototypes of sdslen/sdsavail were removed;
 * redeclaring a static inline function with external linkage is a
 * constraint violation. */
sds sdsdup(const sds s);
void sdsfree(sds s);
sds sdsgrowzero(sds s, size_t len);
sds sdscatlen(sds s, const void *t, size_t len);
sds sdscat(sds s, const char *t);
sds sdscatsds(sds s, const sds t);
sds sdscpylen(sds s, const char *t, size_t len);
sds sdscpy(sds s, const char *t);

sds sdscatvprintf(sds s, const char *fmt, va_list ap);
#ifdef __GNUC__
sds sdscatprintf(sds s, const char *fmt, ...)
	__attribute__((format(printf, 2, 3)));
#else
sds sdscatprintf(sds s, const char *fmt, ...);
#endif

#endif /* __SDS_H */
74 | __attribute__((format(printf, 2, 3))); 75 | #else 76 | sds sdscatprintf(sds s, const char *fmt, ...); 77 | #endif 78 | 79 | sds sdstrim(sds s, const char *cset); 80 | void sdsrange(sds s, int start, int end); 81 | void sdsupdatelen(sds s); 82 | void sdsclear(sds s); 83 | int sdscmp(const sds s1, const sds s2); 84 | sds *sdssplitlen(const char *s, int len, const char *sep, int seplen, int *count); 85 | void sdsfreesplitres(sds *tokens, int count); 86 | void sdstolower(sds s); 87 | void sdstoupper(sds s); 88 | sds sdsfromlonglong(long long value); 89 | sds sdscatrepr(sds s, const char *p, size_t len); 90 | sds *sdssplitargs(const char *line, int *argc); 91 | sds sdsmapchars(sds s, const char *from, const char *to, size_t setlen); 92 | sds sdsjoin(char **argv, int argc, char *sep); 93 | 94 | /* Low level functions exposed to the user API */ 95 | sds sdsMakeRoomFor(sds s, size_t addlen); 96 | void sdsIncrLen(sds s, int incr); 97 | sds sdsRemoveFreeSpace(sds s); 98 | size_t sdsAllocSize(sds s); 99 | 100 | #endif 101 | -------------------------------------------------------------------------------- /src/dedup_phase.c: -------------------------------------------------------------------------------- 1 | /* 2 | * In the phase, 3 | * we aggregate chunks into segments, 4 | * and deduplicate each segment with its similar segments. 5 | * Duplicate chunks are identified and marked. 6 | * For fingerprint indexes exploiting physical locality (e.g., DDFS, Sampled Index), 7 | * segments are only for batch process. 8 | * */ 9 | #include "destor.h" 10 | #include "jcr.h" 11 | #include "index/index.h" 12 | #include "backup.h" 13 | #include "storage/containerstore.h" 14 | 15 | static pthread_t dedup_t; 16 | static int64_t chunk_num; 17 | static int64_t segment_num; 18 | 19 | struct { 20 | /* g_mutex_init() is unnecessary if in static storage. 
*/ 21 | pthread_mutex_t mutex; 22 | pthread_cond_t cond; // index buffer is not full 23 | // index buffer is full, waiting 24 | // if threshold < 0, it indicates no threshold. 25 | int wait_threshold; 26 | } index_lock; 27 | 28 | void send_segment(struct segment* s) { 29 | /* 30 | * CHUNK_SEGMENT_START and _END are used for 31 | * reconstructing the segment in filter phase. 32 | */ 33 | struct chunk* ss = new_chunk(0); 34 | SET_CHUNK(ss, CHUNK_SEGMENT_START); 35 | sync_queue_push(dedup_queue, ss); 36 | 37 | GSequenceIter *end = g_sequence_get_end_iter(s->chunks); 38 | GSequenceIter *begin = g_sequence_get_begin_iter(s->chunks); 39 | while(begin != end) { 40 | struct chunk* c = g_sequence_get(begin); 41 | if (!CHECK_CHUNK(c, CHUNK_FILE_START) && !CHECK_CHUNK(c, CHUNK_FILE_END)) { 42 | if (CHECK_CHUNK(c, CHUNK_DUPLICATE)) { 43 | if (c->id == TEMPORARY_ID) { 44 | DEBUG("Dedup phase: %ldth chunk is identical to a unique chunk", 45 | chunk_num++); 46 | } else { 47 | DEBUG("Dedup phase: %ldth chunk is duplicate in container %lld", 48 | chunk_num++, c->id); 49 | } 50 | } else { 51 | DEBUG("Dedup phase: %ldth chunk is unique", chunk_num++); 52 | } 53 | 54 | } 55 | sync_queue_push(dedup_queue, c); 56 | g_sequence_remove(begin); 57 | begin = g_sequence_get_begin_iter(s->chunks); 58 | } 59 | 60 | struct chunk* se = new_chunk(0); 61 | SET_CHUNK(se, CHUNK_SEGMENT_END); 62 | sync_queue_push(dedup_queue, se); 63 | 64 | s->chunk_num = 0; 65 | 66 | } 67 | 68 | void *dedup_thread(void *arg) { 69 | struct segment* s = NULL; 70 | while (1) { 71 | struct chunk *c = NULL; 72 | if (destor.simulation_level != SIMULATION_ALL) 73 | c = sync_queue_pop(hash_queue); 74 | else 75 | c = sync_queue_pop(trace_queue); 76 | 77 | /* Add the chunk to the segment. 
*/ 78 | s = segmenting(c); 79 | if (!s) 80 | continue; 81 | /* segmenting success */ 82 | if (s->chunk_num > 0) { 83 | VERBOSE("Dedup phase: the %lldth segment of %lld chunks", segment_num++, 84 | s->chunk_num); 85 | /* Each duplicate chunk will be marked. */ 86 | pthread_mutex_lock(&index_lock.mutex); 87 | while (index_lookup(s) == 0) { 88 | pthread_cond_wait(&index_lock.cond, &index_lock.mutex); 89 | } 90 | pthread_mutex_unlock(&index_lock.mutex); 91 | } else { 92 | VERBOSE("Dedup phase: an empty segment"); 93 | } 94 | /* Send chunks in the segment to the next phase. 95 | * The segment will be cleared. */ 96 | send_segment(s); 97 | 98 | free_segment(s); 99 | s = NULL; 100 | 101 | if (c == NULL) 102 | break; 103 | } 104 | 105 | sync_queue_term(dedup_queue); 106 | 107 | return NULL; 108 | } 109 | 110 | void start_dedup_phase() { 111 | 112 | if(destor.index_segment_algorithm[0] == INDEX_SEGMENT_CONTENT_DEFINED) 113 | index_lock.wait_threshold = destor.rewrite_algorithm[1] + destor.index_segment_max - 1; 114 | else if(destor.index_segment_algorithm[0] == INDEX_SEGMENT_FIXED) 115 | index_lock.wait_threshold = destor.rewrite_algorithm[1] + destor.index_segment_algorithm[1] - 1; 116 | else 117 | index_lock.wait_threshold = -1; // file-defined segmenting has no threshold. 118 | 119 | pthread_mutex_init(&index_lock.mutex, NULL); 120 | pthread_cond_init(&index_lock.cond, NULL); 121 | 122 | dedup_queue = sync_queue_new(1000); 123 | 124 | pthread_create(&dedup_t, NULL, dedup_thread, NULL); 125 | } 126 | 127 | void stop_dedup_phase() { 128 | pthread_join(dedup_t, NULL); 129 | NOTICE("dedup phase stops successfully: %d segments of %d chunks on average", 130 | segment_num, segment_num ? 
chunk_num / segment_num : 0); 131 | } 132 | -------------------------------------------------------------------------------- /src/rewrite_phase.c: -------------------------------------------------------------------------------- 1 | /* 2 | * In the phase, 3 | * we mark the chunks required to be rewriting. 4 | */ 5 | #include "destor.h" 6 | #include "jcr.h" 7 | #include "rewrite_phase.h" 8 | #include "backup.h" 9 | 10 | static pthread_t rewrite_t; 11 | 12 | /* Descending order */ 13 | gint g_record_descmp_by_length(struct containerRecord* a, 14 | struct containerRecord* b, gpointer user_data) { 15 | return b->size - a->size; 16 | } 17 | 18 | gint g_record_cmp_by_id(struct containerRecord* a, struct containerRecord* b, 19 | gpointer user_data) { 20 | return a->cid - b->cid; 21 | } 22 | 23 | static void init_rewrite_buffer() { 24 | rewrite_buffer.chunk_queue = g_queue_new(); 25 | rewrite_buffer.container_record_seq = g_sequence_new(free); 26 | rewrite_buffer.num = 0; 27 | rewrite_buffer.size = 0; 28 | } 29 | 30 | /* 31 | * return 1 if buffer is full; 32 | * return 0 if buffer is not full. 
33 | */ 34 | int rewrite_buffer_push(struct chunk* c) { 35 | g_queue_push_tail(rewrite_buffer.chunk_queue, c); 36 | 37 | if (CHECK_CHUNK(c, CHUNK_FILE_START) || CHECK_CHUNK(c, CHUNK_FILE_END) 38 | || CHECK_CHUNK(c, CHUNK_SEGMENT_START) || CHECK_CHUNK(c, CHUNK_SEGMENT_END)) 39 | return 0; 40 | 41 | if (c->id != TEMPORARY_ID) { 42 | assert(CHECK_CHUNK(c, CHUNK_DUPLICATE)); 43 | struct containerRecord tmp_record; 44 | tmp_record.cid = c->id; 45 | GSequenceIter *iter = g_sequence_lookup( 46 | rewrite_buffer.container_record_seq, &tmp_record, 47 | g_record_cmp_by_id, 48 | NULL); 49 | if (iter == NULL) { 50 | struct containerRecord* record = malloc( 51 | sizeof(struct containerRecord)); 52 | record->cid = c->id; 53 | record->size = c->size; 54 | /* We first assume it is out-of-order */ 55 | record->out_of_order = 1; 56 | g_sequence_insert_sorted(rewrite_buffer.container_record_seq, 57 | record, g_record_cmp_by_id, NULL); 58 | } else { 59 | struct containerRecord* record = g_sequence_get(iter); 60 | assert(record->cid == c->id); 61 | record->size += c->size; 62 | } 63 | } 64 | 65 | rewrite_buffer.num++; 66 | rewrite_buffer.size += c->size; 67 | 68 | if (rewrite_buffer.num >= destor.rewrite_algorithm[1]) { 69 | assert(rewrite_buffer.num == destor.rewrite_algorithm[1]); 70 | return 1; 71 | } 72 | 73 | return 0; 74 | } 75 | 76 | struct chunk* rewrite_buffer_top() { 77 | return g_queue_peek_head(rewrite_buffer.chunk_queue); 78 | } 79 | 80 | struct chunk* rewrite_buffer_pop() { 81 | struct chunk* c = g_queue_pop_head(rewrite_buffer.chunk_queue); 82 | 83 | if (c && !CHECK_CHUNK(c, CHUNK_FILE_START) && !CHECK_CHUNK(c, CHUNK_FILE_END) 84 | && !CHECK_CHUNK(c, CHUNK_SEGMENT_START) && !CHECK_CHUNK(c, CHUNK_SEGMENT_END)) { 85 | /* A normal chunk */ 86 | if (CHECK_CHUNK(c, CHUNK_DUPLICATE) && c->id != TEMPORARY_ID) { 87 | GSequenceIter *iter = g_sequence_lookup( 88 | rewrite_buffer.container_record_seq, &c->id, 89 | g_record_cmp_by_id, NULL); 90 | assert(iter); 91 | struct 
containerRecord* record = g_sequence_get(iter); 92 | record->size -= c->size; 93 | if (record->size == 0) 94 | g_sequence_remove(iter); 95 | 96 | /* History-Aware Rewriting */ 97 | if (destor.rewrite_enable_har && CHECK_CHUNK(c, CHUNK_DUPLICATE)) 98 | har_check(c); 99 | } 100 | rewrite_buffer.num--; 101 | rewrite_buffer.size -= c->size; 102 | } 103 | 104 | return c; 105 | } 106 | 107 | /* 108 | * If rewrite is disable. 109 | */ 110 | static void* no_rewrite(void* arg) { 111 | while (1) { 112 | struct chunk* c = sync_queue_pop(dedup_queue); 113 | 114 | if (c == NULL) 115 | break; 116 | 117 | sync_queue_push(rewrite_queue, c); 118 | 119 | /* History-Aware Rewriting */ 120 | if (destor.rewrite_enable_har && CHECK_CHUNK(c, CHUNK_DUPLICATE)) 121 | har_check(c); 122 | } 123 | 124 | sync_queue_term(rewrite_queue); 125 | 126 | return NULL; 127 | } 128 | 129 | void start_rewrite_phase() { 130 | rewrite_queue = sync_queue_new(1000); 131 | 132 | init_rewrite_buffer(); 133 | 134 | init_har(); 135 | 136 | if (destor.rewrite_algorithm[0] == REWRITE_NO) { 137 | pthread_create(&rewrite_t, NULL, no_rewrite, NULL); 138 | } else if (destor.rewrite_algorithm[0] 139 | == REWRITE_CFL_SELECTIVE_DEDUPLICATION) { 140 | pthread_create(&rewrite_t, NULL, cfl_rewrite, NULL); 141 | } else if (destor.rewrite_algorithm[0] == REWRITE_CONTEXT_BASED) { 142 | pthread_create(&rewrite_t, NULL, cbr_rewrite, NULL); 143 | } else if (destor.rewrite_algorithm[0] == REWRITE_CAPPING) { 144 | pthread_create(&rewrite_t, NULL, cap_rewrite, NULL); 145 | } else { 146 | fprintf(stderr, "Invalid rewrite algorithm\n"); 147 | exit(1); 148 | } 149 | 150 | } 151 | 152 | void stop_rewrite_phase() { 153 | pthread_join(rewrite_t, NULL); 154 | NOTICE("rewrite phase stops successfully!"); 155 | } 156 | -------------------------------------------------------------------------------- /src/cma.c: -------------------------------------------------------------------------------- 1 | /* 2 | * The Container-Marker Algorithm. 
3 | * After each backup, we read the original manifest, and update the backup times. 4 | * In each deletion operation, 5 | * the containers with a time smaller than the time of deleted backup are reclaimed. 6 | */ 7 | 8 | #include "cma.h" 9 | #include "storage/containerstore.h" 10 | #include "jcr.h" 11 | 12 | struct record{ 13 | containerid id; 14 | int time; 15 | }; 16 | 17 | void update_manifest(GHashTable *monitor){ 18 | 19 | GHashTable *manifest = g_hash_table_new_full(g_int64_hash, g_int64_equal, NULL, free); 20 | 21 | sds fname = sdsdup(destor.working_directory); 22 | fname = sdscat(fname, "/manifest"); 23 | FILE *fp = NULL; 24 | if((fp = fopen(fname, "r"))){ 25 | /* file exists. Reconstruct the manifest from the file. */ 26 | struct record tmp; 27 | while(fscanf(fp, "%lld,%d", &tmp.id, &tmp.time) == 2){ 28 | struct record* rec = (struct record*) malloc(sizeof(struct record)); 29 | rec->id = tmp.id; 30 | rec->time = tmp.time; 31 | g_hash_table_insert(manifest, &rec->id, rec); 32 | } 33 | 34 | NOTICE("CMA: read %d records.", g_hash_table_size(manifest)); 35 | 36 | fclose(fp); 37 | } 38 | 39 | /* Update the backup times in the manifest. */ 40 | GHashTableIter iter; 41 | gpointer key, value; 42 | g_hash_table_iter_init(&iter, monitor); 43 | while(g_hash_table_iter_next(&iter, &key, &value)){ 44 | /* the key is a pointer to a container ID. */ 45 | struct record *r = g_hash_table_lookup(manifest, key); 46 | if(!r){ 47 | r = (struct record*) malloc(sizeof(struct record)); 48 | r->id = *(containerid*)key; 49 | g_hash_table_insert(manifest, &r->id, r); 50 | } 51 | r->time = jcr.id; 52 | } 53 | 54 | /* Flush the manifest */ 55 | if((fp = fopen(fname, "w"))){ 56 | /* Update the manifest into the file. 
*/ 57 | g_hash_table_iter_init(&iter, manifest); 58 | while(g_hash_table_iter_next(&iter, &key, &value)){ 59 | struct record* r = value; 60 | fprintf(fp, "%lld,%d\n", r->id, r->time); 61 | } 62 | 63 | NOTICE("CMA: update %d records.", g_hash_table_size(manifest)); 64 | fclose(fp); 65 | }else{ 66 | WARNING("Cannot create the manifest!"); 67 | exit(1); 68 | } 69 | 70 | destor.live_container_num = g_hash_table_size(manifest); 71 | 72 | g_hash_table_destroy(manifest); 73 | 74 | sdsfree(fname); 75 | } 76 | 77 | /* 78 | * Be called when users delete backups in FIFO order. 79 | * Delete all backups earlier than jobid. 80 | * All container IDs with a time smaller than or equal to jobid can be removed. 81 | * Return these IDs. 82 | */ 83 | GHashTable* trunc_manifest(int jobid){ 84 | /* The containers we reclaim */ 85 | GHashTable *invalid_containers = g_hash_table_new_full(g_int64_hash, g_int64_equal, free, NULL); 86 | 87 | GHashTable *manifest = g_hash_table_new_full(g_int64_hash, g_int64_equal, NULL, free); 88 | 89 | sds fname = sdsdup(destor.working_directory); 90 | fname = sdscat(fname, "/manifest"); 91 | FILE *fp = NULL; 92 | if((fp = fopen(fname, "r"))){ 93 | /* file exists. Reconstruct the manifest from the file. */ 94 | struct record tmp; 95 | while(fscanf(fp, "%lld,%d", &tmp.id, &tmp.time) == 2){ 96 | struct record* rec = (struct record*) malloc(sizeof(struct record)); 97 | if(tmp.time <= jobid){ 98 | /* This record can be removed. */ 99 | containerid *cid = (containerid*) malloc(sizeof(containerid)); 100 | *cid = tmp.id; 101 | g_hash_table_insert(invalid_containers, cid, NULL); 102 | NOTICE("CMA: container %lld can be reclaimed.", cid); 103 | }else{ 104 | /* This record remains valid. 
*/ 105 | rec->id = tmp.id; 106 | rec->time = tmp.time; 107 | g_hash_table_insert(manifest, &rec->id, rec); 108 | } 109 | } 110 | 111 | NOTICE("CMA: %d of records are valid.", g_hash_table_size(manifest)); 112 | NOTICE("CMA: %d of records are going to be reclaimed.", g_hash_table_size(invalid_containers)); 113 | 114 | fclose(fp); 115 | }else{ 116 | NOTICE("manifest doesn't exist!"); 117 | exit(1); 118 | } 119 | 120 | if((fp = fopen(fname, "w"))){ 121 | GHashTableIter iter; 122 | gpointer key, value; 123 | g_hash_table_iter_init(&iter, manifest); 124 | while(g_hash_table_iter_next(&iter, &key, &value)){ 125 | struct record* rec = value; 126 | fprintf(fp, "%lld,%d\n", rec->id, rec->time); 127 | } 128 | fclose(fp); 129 | }else{ 130 | WARNING("CMA: cannot create manifest!"); 131 | exit(1); 132 | } 133 | 134 | destor.live_container_num = g_hash_table_size(manifest); 135 | 136 | g_hash_table_destroy(manifest); 137 | 138 | return invalid_containers; 139 | } 140 | -------------------------------------------------------------------------------- /src/fsl/read_fsl_trace.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2014 Sonam Mandal 3 | * Copyright (c) 2014 Vasily Tarasov 4 | * Copyright (c) 2014 Will Buik 5 | * Copyright (c) 2014 Erez Zadok 6 | * Copyright (c) 2014 Geoff Kuenning 7 | * Copyright (c) 2014 Stony Brook University 8 | * Copyright (c) 2014 Harvey Mudd College 9 | * Copyright (c) 2014 The Research Foundation of the State University of New York 10 | * 11 | * This program is free software; you can redistribute it and/or modify 12 | * it under the terms of the GNU General Public License version 2 as 13 | * published by the Free Software Foundation. 
14 | */ 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #include "../destor.h" 26 | #include "../jcr.h" 27 | #include "../backup.h" 28 | 29 | /* Use this macros if libhashfile library is installed on your system */ 30 | // #include 31 | 32 | /* Use this macros if libhashfile library is NOT installed on your system */ 33 | #include "libhashfile.h" 34 | 35 | #define MAXLINE 4096 36 | 37 | static void print_chunk_hash(uint64_t chunk_count, const uint8_t *hash, 38 | int hash_size_in_bytes) 39 | { 40 | int j; 41 | 42 | printf("Chunk %06"PRIu64 ": ", chunk_count); 43 | 44 | printf("%.2hhx", hash[0]); 45 | for (j = 1; j < hash_size_in_bytes; j++) 46 | printf(":%.2hhx", hash[j]); 47 | printf("\n"); 48 | } 49 | 50 | void* read_fsl_trace(void *argv) 51 | { 52 | char buf[MAXLINE]; 53 | struct hashfile_handle *handle; 54 | const struct chunk_info *ci; 55 | uint64_t chunk_count; 56 | time_t scan_start_time; 57 | int ret; 58 | 59 | handle = hashfile_open(jcr.path); 60 | if (!handle) { 61 | fprintf(stderr, "Error opening hash file: %d!", errno); 62 | exit(1); 63 | } 64 | 65 | /* Print some information about the hash file */ 66 | scan_start_time = hashfile_start_time(handle); 67 | printf("Collected at [%s] on %s", 68 | hashfile_sysid(handle), 69 | ctime(&scan_start_time)); 70 | 71 | ret = hashfile_chunking_method_str(handle, buf, MAXLINE); 72 | if (ret < 0) { 73 | fprintf(stderr, "Unrecognized chunking method: %d!", errno); 74 | exit(1); 75 | } 76 | 77 | printf("Chunking method: %s", buf); 78 | 79 | ret = hashfile_hashing_method_str(handle, buf, MAXLINE); 80 | if (ret < 0) { 81 | fprintf(stderr, "Unrecognized hashing method: %d!", errno); 82 | exit(1); 83 | } 84 | 85 | printf("Hashing method: %s\n", buf); 86 | 87 | /* Go over the files in a hashfile */ 88 | /*printf("== List of files and hashes ==\n");*/ 89 | while (1) { 90 | 91 | TIMER_DECLARE(1); 92 | TIMER_BEGIN(1); 93 | 94 | ret = 
hashfile_next_file(handle); 95 | 96 | TIMER_END(1, jcr.read_time); 97 | 98 | if (ret < 0) { 99 | fprintf(stderr, 100 | "Cannot get next file from a hashfile: %d!\n", 101 | errno); 102 | exit(1); 103 | } 104 | 105 | /* exit the loop if it was the last file */ 106 | if (ret == 0) 107 | break; 108 | 109 | /*printf("File path: %s\n", hashfile_curfile_path(handle));*/ 110 | /*printf("File size: %"PRIu64 " B\n",*/ 111 | /*hashfile_curfile_size(handle));*/ 112 | /*printf("Chunks number: %" PRIu64 "\n",*/ 113 | /*hashfile_curfile_numchunks(handle));*/ 114 | 115 | struct chunk* c = new_chunk(strlen(hashfile_curfile_path(handle))+1); 116 | strcpy(c->data, hashfile_curfile_path(handle)); 117 | 118 | VERBOSE("Read trace phase: %s", c->data); 119 | 120 | SET_CHUNK(c, CHUNK_FILE_START); 121 | 122 | sync_queue_push(trace_queue, c); 123 | 124 | /* Go over the chunks in the current file */ 125 | chunk_count = 0; 126 | while (1) { 127 | TIMER_BEGIN(1); 128 | ci = hashfile_next_chunk(handle); 129 | TIMER_END(1, jcr.read_time); 130 | 131 | if (!ci) /* exit the loop if it was the last chunk */ 132 | break; 133 | 134 | chunk_count++; 135 | 136 | c = new_chunk(0); 137 | 138 | /*print_chunk_hash(chunk_count, ci->hash,*/ 139 | /*hashfile_hash_size(handle) / 8);*/ 140 | 141 | c->size = ci->size; 142 | /* 143 | * Need some padding. 
/*
 * Render a 20-byte fingerprint as 40 uppercase hex digits (no NUL is
 * appended; the caller owns termination).
 */
void hash2code(unsigned char hash[20], char code[40]) {
	/* Table lookup replaces the original 16-way switch. */
	static const char digits[] = "0123456789ABCDEF";
	int i;
	for (i = 0; i < 20; i++) {
		code[2 * i] = digits[hash[i] >> 4];
		code[2 * i + 1] = digits[hash[i] & 0x0f];
	}
}

/*
 * Inverse of hash2code: parse 40 hex digits (upper- or lower-case accepted)
 * back into a 20-byte fingerprint.
 */
void code2hash(unsigned char code[40], unsigned char hash[20]) {
	int i, j;
	bzero(hash, 20);
	for (i = 0; i < 20; i++) {
		for (j = 0; j < 2; j++) {
			unsigned char ch = code[2 * i + j];
			unsigned char v;
			if (ch >= 'A' && ch <= 'F')
				v = ch - 'A' + 10;
			else if (ch >= 'a' && ch <= 'f')
				v = ch - 'a' + 10;
			else
				v = ch - '0'; /* as before, non-hex input is not validated */
			hash[i] = hash[i] * 16 + v;
		}
	}
}
assert(strncmp(line, "file start ", 11) == 0); 167 | int filenamelen; 168 | sscanf(line, "file start %d", &filenamelen); 169 | 170 | /* An additional '\n' is read */ 171 | c = new_chunk(filenamelen + 2); 172 | fgets(c->data, filenamelen + 2, trace_file); 173 | c->data[filenamelen] = 0; 174 | VERBOSE("Read trace phase: %s", c->data); 175 | 176 | SET_CHUNK(c, CHUNK_FILE_START); 177 | 178 | TIMER_END(1, jcr.read_time); 179 | 180 | sync_queue_push(trace_queue, c); 181 | 182 | TIMER_BEGIN(1); 183 | fgets(line, 128, trace_file); 184 | while (strncmp(line, "file end", 8) != 0) { 185 | c = new_chunk(0); 186 | 187 | char code[41]; 188 | strncpy(code, line, 40); 189 | code2hash(code, c->fp); 190 | 191 | c->size = atoi(line + 41); 192 | 193 | TIMER_END(1, jcr.read_time); 194 | sync_queue_push(trace_queue, c); 195 | TIMER_BEGIN(1), 196 | 197 | fgets(line, 128, trace_file); 198 | } 199 | 200 | c = new_chunk(0); 201 | SET_CHUNK(c, CHUNK_FILE_END); 202 | sync_queue_push(trace_queue, c); 203 | } 204 | 205 | fclose(trace_file); 206 | return NULL; 207 | } 208 | 209 | /* fsl/read_fsl_trace.c */ 210 | extern void* read_fsl_trace(void *argv); 211 | 212 | void start_read_trace_phase() { 213 | /* running job */ 214 | jcr.status = JCR_STATUS_RUNNING; 215 | trace_queue = sync_queue_new(100); 216 | if(destor.trace_format == TRACE_DESTOR) 217 | pthread_create(&trace_t, NULL, read_trace_thread, NULL); 218 | else if(destor.trace_format == TRACE_FSL) 219 | pthread_create(&trace_t, NULL, read_fsl_trace, NULL); 220 | else { 221 | NOTICE("Invalid trace format"); 222 | exit(1); 223 | } 224 | } 225 | 226 | void stop_read_trace_phase() { 227 | pthread_join(trace_t, NULL); 228 | NOTICE("read trace phase stops successfully!"); 229 | } 230 | -------------------------------------------------------------------------------- /src/utils/serial.c: -------------------------------------------------------------------------------- 1 | /* 2 | *Serialisation Support Functions 3 | */ 4 | /* 5 | 6 | NOTE: The 
following functions should work on any 7 | vaguely contemporary platform. Production 8 | builds should use optimised macros (void 9 | on platforms with network byte order and IEEE 10 | floating point format as native. 11 | 12 | */ 13 | /* serial_int16 -- Serialise a signed 16 bit integer. */ 14 | #include "serial.h" 15 | #include 16 | #include 17 | 18 | void serial_int16(uint8_t * * const ptr, const int16_t v) 19 | { 20 | int16_t vo = htons(v); 21 | 22 | memcpy(*ptr, &vo, sizeof vo); 23 | *ptr += sizeof vo; 24 | } 25 | 26 | /* serial_uint16 -- Serialise an unsigned 16 bit integer. */ 27 | 28 | void serial_uint16(uint8_t * * const ptr, const uint16_t v) 29 | { 30 | uint16_t vo = htons(v); 31 | 32 | memcpy(*ptr, &vo, sizeof vo); 33 | *ptr += sizeof vo; 34 | } 35 | 36 | /* serial_int32 -- Serialise a signed 32 bit integer. */ 37 | 38 | void serial_int32(uint8_t * * const ptr, const int32_t v) 39 | { 40 | int32_t vo = htonl(v); 41 | 42 | memcpy(*ptr, &vo, sizeof vo); 43 | *ptr += sizeof vo; 44 | } 45 | 46 | /* serial_uint32 -- Serialise an unsigned 32 bit integer. */ 47 | 48 | void serial_uint32(uint8_t * * const ptr, const uint32_t v) 49 | { 50 | uint32_t vo = htonl(v); 51 | 52 | memcpy(*ptr, &vo, sizeof vo); 53 | *ptr += sizeof vo; 54 | } 55 | 56 | /* serial_int64 -- Serialise a signed 64 bit integer. */ 57 | 58 | void serial_int64(uint8_t * * const ptr, const int64_t v) 59 | { 60 | if (htonl(1) == 1L) { 61 | memcpy(*ptr, &v, sizeof(int64_t)); 62 | } else { 63 | int i; 64 | uint8_t rv[sizeof(int64_t)]; 65 | uint8_t *pv = (uint8_t *) &v; 66 | 67 | for (i = 0; i < 8; i++) { 68 | rv[i] = pv[7 - i]; 69 | } 70 | memcpy(*ptr, &rv, sizeof(int64_t)); 71 | } 72 | *ptr += sizeof(int64_t); 73 | } 74 | 75 | 76 | /* serial_uint64 -- Serialise an unsigned 64 bit integer. 
/* serial_uint64 -- Serialise an unsigned 64 bit integer into network
 * (big-endian) byte order and advance *ptr past it. */
void serial_uint64(uint8_t * * const ptr, const uint64_t v)
{
	const uint8_t *src = (const uint8_t *) &v;
	uint8_t *dst = *ptr;
	int i;

	if (htonl(1) == 1L) {
		/* Big-endian host: the value is already in wire order. */
		memcpy(dst, src, sizeof(uint64_t));
	} else {
		/* Little-endian host: emit the bytes reversed. */
		for (i = 0; i < 8; i++)
			dst[i] = src[7 - i];
	}
	*ptr += sizeof(uint64_t);
}

/* serial_string -- Serialise a NUL-terminated string, terminator included. */
void serial_string(uint8_t * * const ptr, const char * const str)
{
	size_t n = strlen(str) + 1;

	memcpy(*ptr, str, n);
	*ptr += n;
}


/* unserial_int16 -- Unserialise a signed 16 bit integer. */

int16_t unserial_int16(uint8_t * * const ptr)
{
	int16_t wire;

	memcpy(&wire, *ptr, sizeof wire);
	*ptr += sizeof wire;
	return ntohs(wire);
}

/* unserial_uint16 -- Unserialise an unsigned 16 bit integer. */

uint16_t unserial_uint16(uint8_t * * const ptr)
{
	uint16_t wire;

	memcpy(&wire, *ptr, sizeof wire);
	*ptr += sizeof wire;
	return ntohs(wire);
}

/* unserial_int32 -- Unserialise a signed 32 bit integer. */

int32_t unserial_int32(uint8_t * * const ptr)
{
	int32_t wire;

	memcpy(&wire, *ptr, sizeof wire);
	*ptr += sizeof wire;
	return ntohl(wire);
}

/* unserial_uint32 -- Unserialise an unsigned 32 bit integer. */

uint32_t unserial_uint32(uint8_t * * const ptr)
{
	uint32_t wire;

	memcpy(&wire, *ptr, sizeof wire);
	*ptr += sizeof wire;
	return ntohl(wire);
}

/* unserial_int64 -- Unserialise a signed 64 bit integer from network order. */

int64_t unserial_int64(uint8_t * * const ptr)
{
	int64_t v;
	const uint8_t *src = *ptr;

	if (htonl(1) == 1L) {
		memcpy(&v, src, sizeof(int64_t));
	} else {
		uint8_t *dst = (uint8_t *) &v;
		int i;

		for (i = 0; i < 8; i++)
			dst[i] = src[7 - i];
	}
	*ptr += sizeof(int64_t);
	return v;
}

/* unserial_uint64 -- Unserialise an unsigned 64 bit integer. */
uint64_t unserial_uint64(uint8_t * * const ptr)
{
	uint64_t v;
	const uint8_t *src = *ptr;

	if (htonl(1) == 1L) {
		memcpy(&v, src, sizeof(uint64_t));
	} else {
		uint8_t *dst = (uint8_t *) &v;
		int i;

		for (i = 0; i < 8; i++)
			dst[i] = src[7 - i];
	}
	*ptr += sizeof(uint64_t);
	return v;
}

/* unserial_string -- Copy a NUL-terminated string out of the buffer. */
void unserial_string(uint8_t * * const ptr, char * const str)
{
	size_t n = strlen((const char *) *ptr) + 1;

	memcpy(str, *ptr, n);
	*ptr += n;
}
14 | struct containerRecord *record = g_sequence_get(iter); 15 | double coverage = (record->size + c->size) / (double) (CONTAINER_SIZE - CONTAINER_META_SIZE); 16 | rewrite_utility = coverage >= 1 ? 0 : rewrite_utility - coverage; 17 | return rewrite_utility; 18 | } 19 | 20 | struct { 21 | int32_t chunk_num; 22 | double current_utility_threshold; 23 | int min_index; 24 | /* [0,1/10000), [1/10000, 2/10000), ... , [9999/10000, 1] */ 25 | int32_t buckets[10000]; 26 | } utility_buckets; 27 | 28 | /* init utility buckets */ 29 | void init_utility_buckets() { 30 | utility_buckets.chunk_num = 0; 31 | utility_buckets.min_index = destor.rewrite_cbr_minimal_utility == 1 ? 32 | 9999 : destor.rewrite_cbr_minimal_utility * 10000; 33 | utility_buckets.current_utility_threshold = 34 | destor.rewrite_cbr_minimal_utility; 35 | bzero(&utility_buckets.buckets, sizeof(utility_buckets.buckets)); 36 | } 37 | 38 | static void utility_buckets_update(double rewrite_utility) { 39 | utility_buckets.chunk_num++; 40 | int index = rewrite_utility >= 1 ? 9999 : rewrite_utility * 10000; 41 | utility_buckets.buckets[index]++; 42 | if (utility_buckets.chunk_num >= 100) { 43 | int best_num = utility_buckets.chunk_num * destor.rewrite_cbr_limit; 44 | int current_index = 9999; 45 | int count = 0; 46 | for (; current_index >= utility_buckets.min_index; --current_index) { 47 | count += utility_buckets.buckets[current_index]; 48 | if (count >= best_num) { 49 | break; 50 | } 51 | } 52 | utility_buckets.current_utility_threshold = (current_index + 1) 53 | / 10000.0; 54 | } 55 | } 56 | 57 | /* --------------------------------------------------------------------------*/ 58 | /** 59 | * @Synopsis Reducing impact of data fragmentation caused by in-line deduplication. 60 | * In SYSTOR'12. 61 | * 62 | * We first buffer a fixed-sized buffer of chunks for the decision chunk. 63 | * Then, find all chunks in an identical container with the decision chunk. 
64 | * If these chunks are large enough, they are all not fragmentation. 65 | * An important optimization is that, 66 | * if we find a decision chunk already being marked not fragmented, 67 | * we should ensure its physical neighbors in the buffer also being marked not fragmented. 68 | * This optimization is very important for CBR. 69 | * @Param 70 | * 71 | * @Returns 72 | */ 73 | /* ----------------------------------------------------------------------------*/ 74 | void *cbr_rewrite(void* arg) { 75 | 76 | init_utility_buckets(); 77 | 78 | /* content-based rewrite*/ 79 | while (1) { 80 | struct chunk *c = sync_queue_pop(dedup_queue); 81 | if (c == NULL) 82 | break; 83 | 84 | TIMER_DECLARE(1); 85 | TIMER_BEGIN(1); 86 | 87 | if (!rewrite_buffer_push(c)) { 88 | TIMER_END(1, jcr.rewrite_time); 89 | continue; 90 | } 91 | 92 | TIMER_BEGIN(1); 93 | struct chunk *decision_chunk = rewrite_buffer_top(); 94 | while (CHECK_CHUNK(decision_chunk, CHUNK_FILE_START) 95 | || CHECK_CHUNK(decision_chunk, CHUNK_FILE_END) 96 | || CHECK_CHUNK(decision_chunk, CHUNK_SEGMENT_START) 97 | || CHECK_CHUNK(decision_chunk, CHUNK_SEGMENT_END)) { 98 | rewrite_buffer_pop(); 99 | TIMER_END(1, jcr.rewrite_time); 100 | sync_queue_push(rewrite_queue, decision_chunk); 101 | TIMER_BEGIN(1); 102 | decision_chunk = rewrite_buffer_top(); 103 | } 104 | 105 | TIMER_BEGIN(1); 106 | /* A normal chunk */ 107 | double rewrite_utility = 0; 108 | 109 | if (decision_chunk->id != TEMPORARY_ID) { 110 | assert(CHECK_CHUNK(decision_chunk, CHUNK_DUPLICATE)); 111 | /* a duplicate chunk */ 112 | GSequenceIter *iter = g_sequence_lookup( 113 | rewrite_buffer.container_record_seq, &decision_chunk->id, 114 | g_record_cmp_by_id, NULL); 115 | assert(iter); 116 | struct containerRecord *record = g_sequence_get(iter); 117 | 118 | if (record->out_of_order == 1) { 119 | rewrite_utility = get_rewrite_utility(decision_chunk); 120 | if (rewrite_utility < destor.rewrite_cbr_minimal_utility 121 | || rewrite_utility < 
utility_buckets.current_utility_threshold) { 122 | record->out_of_order = 0; 123 | } else { 124 | VERBOSE("Rewrite phase: %lldth chunk is in out-of-order container %lld", 125 | chunk_num, decision_chunk->id); 126 | SET_CHUNK(decision_chunk, CHUNK_OUT_OF_ORDER); 127 | } 128 | 129 | } else { 130 | /* if marked as not out of order*/ 131 | rewrite_utility = 0; 132 | } 133 | } 134 | 135 | utility_buckets_update(rewrite_utility); 136 | chunk_num++; 137 | 138 | rewrite_buffer_pop(); 139 | TIMER_END(1, jcr.rewrite_time); 140 | sync_queue_push(rewrite_queue, decision_chunk); 141 | } 142 | 143 | /* process the remaining chunks in stream context */ 144 | struct chunk *remaining_chunk = NULL; 145 | while ((remaining_chunk = rewrite_buffer_pop())) 146 | sync_queue_push(rewrite_queue, remaining_chunk); 147 | sync_queue_term(rewrite_queue); 148 | 149 | return NULL; 150 | } 151 | -------------------------------------------------------------------------------- /src/utils/serial.h: -------------------------------------------------------------------------------- 1 | /* Serialisation support functions from serial.c. 
*/ 2 | #include 3 | #include 4 | 5 | extern void serial_int16(uint8_t * * const ptr, const int16_t v); 6 | extern void serial_uint16(uint8_t * * const ptr, const uint16_t v); 7 | extern void serial_int32(uint8_t * * const ptr, const int32_t v); 8 | extern void serial_uint32(uint8_t * * const ptr, const uint32_t v); 9 | extern void serial_int64(uint8_t * * ptr, int64_t v); 10 | extern void serial_uint64(uint8_t * * const ptr, const uint64_t v); 11 | //extern void serial_btime(uint8_t * * const ptr, const btime_t v); 12 | //extern void serial_float64(uint8_t * * const ptr, const float64_t v); 13 | extern void serial_string(uint8_t * * const ptr, const char * const str); 14 | 15 | extern int16_t unserial_int16(uint8_t * * const ptr); 16 | extern uint16_t unserial_uint16(uint8_t * * const ptr); 17 | extern int32_t unserial_int32(uint8_t * * const ptr); 18 | extern uint32_t unserial_uint32(uint8_t * * const ptr); 19 | extern int64_t unserial_int64(uint8_t * * const ptr); 20 | extern uint64_t unserial_uint64(uint8_t * * const ptr); 21 | //extern btime_t unserial_btime(uint8_t * * const ptr); 22 | //extern float64_t unserial_float64(uint8_t * * const ptr); 23 | extern void unserial_string(uint8_t * * const ptr, char * const str); 24 | 25 | /* 26 | 27 | Serialisation Macros 28 | 29 | These macros use a uint8_t pointer, ser_ptr, which must be 30 | defined by the code which uses them. 31 | 32 | */ 33 | 34 | #ifndef __SERIAL_H_ 35 | #define __SERIAL_H_ 1 36 | 37 | /* ser_declare -- Declare ser_ptr locally within a function. */ 38 | #define ser_declare uint8_t *ser_ptr 39 | #define unser_declare uint8_t *ser_ptr 40 | 41 | /* ser_begin(x, s) -- Begin serialisation into a buffer x of size s. */ 42 | #define ser_begin(x, s) ser_ptr = ((uint8_t *)(x)) 43 | #define unser_begin(x, s) ser_ptr = ((uint8_t *)(x)) 44 | 45 | /* ser_length -- Determine length in bytes of serialised into a 46 | buffer x. 
*/ 47 | #define ser_length(x) (ser_ptr - (uint8_t *)(x)) 48 | #define unser_length(x) (ser_ptr - (uint8_t *)(x)) 49 | 50 | /* ser_end(x, s) -- End serialisation into a buffer x of size s. */ 51 | #define ser_end(x, s) assert(ser_length(x) <= (s)) 52 | #define unser_end(x, s) assert(ser_length(x) <= (s)) 53 | 54 | /* ser_check(x, s) -- Verify length of serialised data in buffer x is 55 | expected length s. */ 56 | #define ser_check(x, s) assert(ser_length(x) == (s)) 57 | 58 | /* Serialisation */ 59 | 60 | /* 8 bit signed integer */ 61 | #define ser_int8(x) *ser_ptr++ = (x) 62 | /* 8 bit unsigned integer */ 63 | #define ser_uint8(x) *ser_ptr++ = (x) 64 | 65 | /* 16 bit signed integer */ 66 | #define ser_int16(x) serial_int16(&ser_ptr, x) 67 | /* 16 bit unsigned integer */ 68 | #define ser_uint16(x) serial_uint16(&ser_ptr, x) 69 | 70 | /* 32 bit signed integer */ 71 | #define ser_int32(x) serial_int32(&ser_ptr, x) 72 | /* 32 bit unsigned integer */ 73 | #define ser_uint32(x) serial_uint32(&ser_ptr, x) 74 | 75 | /* 64 bit signed integer */ 76 | #define ser_int64(x) serial_int64(&ser_ptr, x) 77 | /* 64 bit unsigned integer */ 78 | #define ser_uint64(x) serial_uint64(&ser_ptr, x) 79 | 80 | /* btime -- 64 bit unsigned integer */ 81 | #define ser_btime(x) serial_btime(&ser_ptr, x) 82 | 83 | /* 64 bit IEEE floating point number */ 84 | #define ser_float64(x) serial_float64(&ser_ptr, x) 85 | 86 | /* Binary byte stream len bytes not requiring serialisation */ 87 | #define ser_bytes(x, len) memcpy(ser_ptr, (x), (len)), ser_ptr += (len) 88 | 89 | /* Binary byte stream not requiring serialisation (length obtained by sizeof) */ 90 | #define ser_struct(x) ser_bytes(&(x), (sizeof (x))) 91 | 92 | /* Binary string not requiring serialization */ 93 | #define ser_string(x) serial_string(&ser_ptr, (x)) 94 | 95 | /* Unserialisation */ 96 | 97 | /* 8 bit signed integer */ 98 | #define unser_int8(x) (x) = *ser_ptr++ 99 | /* 8 bit unsigned integer */ 100 | #define unser_uint8(x) (x) = 
*ser_ptr++ 101 | 102 | /* 16 bit signed integer */ 103 | #define unser_int16(x) (x) = unserial_int16(&ser_ptr) 104 | /* 16 bit unsigned integer */ 105 | #define unser_uint16(x) (x) = unserial_uint16(&ser_ptr) 106 | 107 | /* 32 bit signed integer */ 108 | #define unser_int32(x) (x) = unserial_int32(&ser_ptr) 109 | /* 32 bit unsigned integer */ 110 | #define unser_uint32(x) (x) = unserial_uint32(&ser_ptr) 111 | 112 | /* 64 bit signed integer */ 113 | #define unser_int64(x) (x) = unserial_int64(&ser_ptr) 114 | /* 64 bit unsigned integer */ 115 | #define unser_uint64(x) (x) = unserial_uint64(&ser_ptr) 116 | 117 | /* btime -- 64 bit unsigned integer */ 118 | #define unser_btime(x) (x) = unserial_btime(&ser_ptr) 119 | 120 | /* 64 bit IEEE floating point number */ 121 | #define unser_float64(x)(x) = unserial_float64(&ser_ptr) 122 | 123 | /* Binary byte stream len bytes not requiring serialisation */ 124 | #define unser_bytes(x, len) memcpy((x), ser_ptr, (len)), ser_ptr += (len) 125 | 126 | /* Binary byte stream not requiring serialisation (length obtained by sizeof) */ 127 | #define unser_struct(x) unser_bytes(&(x), (sizeof (x))) 128 | 129 | /* Binary string not requiring serialization */ 130 | #define unser_string(x) unserial_string(&ser_ptr, (x)) 131 | 132 | #endif /* __SERIAL_H_ */ 133 | -------------------------------------------------------------------------------- /src/assembly_restore.c: -------------------------------------------------------------------------------- 1 | /* 2 | * assembly_restore.c 3 | * 4 | * Created on: Nov 27, 2013 5 | * Author: fumin 6 | */ 7 | #include "destor.h" 8 | #include "jcr.h" 9 | #include "recipe/recipestore.h" 10 | #include "storage/containerstore.h" 11 | #include "restore.h" 12 | 13 | struct { 14 | GSequence *area; 15 | int64_t area_size; 16 | int64_t size; 17 | } assembly_area; 18 | 19 | static void init_assembly_area() { 20 | assembly_area.area = g_sequence_new(NULL); 21 | assembly_area.size = 0; 22 | assembly_area.area_size = 
(destor.restore_cache[1] - 1) * CONTAINER_SIZE; 23 | } 24 | 25 | /* 26 | * Forward assembly. 27 | * Return a queue of assembled chunks. 28 | * Return NULL if area is empty. 29 | */ 30 | static GQueue* assemble_area() { 31 | 32 | if (g_sequence_get_length(assembly_area.area) == 0) 33 | return NULL; 34 | 35 | GQueue *q = g_queue_new(); 36 | 37 | struct chunk *c = NULL; 38 | GSequenceIter *begin = g_sequence_get_begin_iter(assembly_area.area); 39 | GSequenceIter *end = g_sequence_get_end_iter(assembly_area.area); 40 | for (;begin != end;begin = g_sequence_get_begin_iter(assembly_area.area)) { 41 | c = g_sequence_get(begin); 42 | if (CHECK_CHUNK(c, CHUNK_FILE_START) || CHECK_CHUNK(c, CHUNK_FILE_END)) { 43 | g_sequence_remove(begin); 44 | g_queue_push_tail(q, c); 45 | c = NULL; 46 | } else { 47 | break; 48 | } 49 | } 50 | 51 | /* !c == true indicates no more chunks in the area. */ 52 | if (!c) 53 | return q; 54 | 55 | /* read a container */ 56 | containerid id = c->id; 57 | struct container *con = NULL; 58 | jcr.read_container_num++; 59 | VERBOSE("Restore cache: container %lld is missed", id); 60 | if (destor.simulation_level == SIMULATION_NO) 61 | con = retrieve_container_by_id(id); 62 | 63 | /* assemble the area */ 64 | GSequenceIter *iter = g_sequence_get_begin_iter(assembly_area.area); 65 | end = g_sequence_get_end_iter(assembly_area.area); 66 | for (;iter != end;iter = g_sequence_iter_next(iter)) { 67 | c = g_sequence_get(iter); 68 | if (!CHECK_CHUNK(c, CHUNK_FILE_START) && !CHECK_CHUNK(c, CHUNK_FILE_END) 69 | && id == c->id) { 70 | if (destor.simulation_level == SIMULATION_NO) { 71 | struct chunk *buf = get_chunk_in_container(con, &c->fp); 72 | assert(c->size == buf->size); 73 | c->data = malloc(c->size); 74 | memcpy(c->data, buf->data, c->size); 75 | free_chunk(buf); 76 | } 77 | SET_CHUNK(c, CHUNK_READY); 78 | } 79 | } 80 | 81 | /* issue the assembled area */ 82 | begin = g_sequence_get_begin_iter(assembly_area.area); 83 | end = 
g_sequence_get_end_iter(assembly_area.area); 84 | for (;begin != end;begin = g_sequence_get_begin_iter(assembly_area.area)) { 85 | struct chunk *rc = g_sequence_get(begin); 86 | if (CHECK_CHUNK(rc, CHUNK_FILE_START) || CHECK_CHUNK(rc, CHUNK_FILE_END)){ 87 | g_sequence_remove(begin); 88 | g_queue_push_tail(q, rc); 89 | }else if(CHECK_CHUNK(rc, CHUNK_READY)) { 90 | g_sequence_remove(begin); 91 | g_queue_push_tail(q, rc); 92 | assembly_area.size -= rc->size; 93 | } else { 94 | break; 95 | } 96 | } 97 | return q; 98 | } 99 | 100 | static int assembly_area_push(struct chunk* c) { 101 | /* Indicates end */ 102 | if (c == NULL) 103 | return 1; 104 | 105 | g_sequence_append(assembly_area.area, c); 106 | 107 | if (CHECK_CHUNK(c, CHUNK_FILE_START) || CHECK_CHUNK(c, CHUNK_FILE_END)) 108 | return 0; 109 | 110 | assembly_area.size += c->size; 111 | 112 | if (assembly_area.size >= assembly_area.area_size) 113 | return 1; 114 | 115 | return 0; 116 | } 117 | 118 | void* assembly_restore_thread(void *arg) { 119 | init_assembly_area(); 120 | 121 | struct chunk* c; 122 | while ((c = sync_queue_pop(restore_recipe_queue))) { 123 | 124 | TIMER_DECLARE(1); 125 | TIMER_BEGIN(1); 126 | 127 | if (assembly_area_push(c)) { 128 | /* Full */ 129 | GQueue *q = assemble_area(); 130 | 131 | TIMER_END(1, jcr.read_chunk_time); 132 | 133 | struct chunk* rc; 134 | while ((rc = g_queue_pop_head(q))) { 135 | if (CHECK_CHUNK(rc, CHUNK_FILE_START) 136 | || CHECK_CHUNK(rc, CHUNK_FILE_END)) { 137 | sync_queue_push(restore_chunk_queue, rc); 138 | continue; 139 | } 140 | jcr.data_size += rc->size; 141 | jcr.chunk_num++; 142 | if (destor.simulation_level >= SIMULATION_RESTORE) { 143 | /* Simulating restore. 
*/ 144 | free_chunk(rc); 145 | } else { 146 | sync_queue_push(restore_chunk_queue, rc); 147 | } 148 | } 149 | 150 | g_queue_free(q); 151 | } else { 152 | TIMER_END(1, jcr.read_chunk_time); 153 | } 154 | 155 | } 156 | 157 | assembly_area_push(NULL); 158 | 159 | GQueue *q; 160 | TIMER_DECLARE(1); 161 | TIMER_BEGIN(1); 162 | while ((q = assemble_area())) { 163 | TIMER_END(1, jcr.read_chunk_time); 164 | struct chunk* rc; 165 | while ((rc = g_queue_pop_head(q))) { 166 | 167 | if (CHECK_CHUNK(rc,CHUNK_FILE_START) 168 | || CHECK_CHUNK(rc, CHUNK_FILE_END)) { 169 | sync_queue_push(restore_chunk_queue, rc); 170 | continue; 171 | } 172 | 173 | jcr.data_size += rc->size; 174 | jcr.chunk_num++; 175 | if (destor.simulation_level >= SIMULATION_RESTORE) { 176 | /* Simulating restore. */ 177 | free_chunk(rc); 178 | } else { 179 | sync_queue_push(restore_chunk_queue, rc); 180 | } 181 | } 182 | TIMER_BEGIN(1); 183 | g_queue_free(q); 184 | } 185 | 186 | sync_queue_term(restore_chunk_queue); 187 | return NULL; 188 | } 189 | -------------------------------------------------------------------------------- /src/har_rewrite.c: -------------------------------------------------------------------------------- 1 | /* 2 | * har_rewrite.c 3 | * 4 | * Created on: Nov 27, 2013 5 | * Author: fumin 6 | */ 7 | 8 | #include "destor.h" 9 | #include "rewrite_phase.h" 10 | #include "storage/containerstore.h" 11 | #include "jcr.h" 12 | #include "cma.h" 13 | 14 | static GHashTable *container_utilization_monitor; 15 | static GHashTable *inherited_sparse_containers; 16 | 17 | void init_har() { 18 | 19 | /* Monitor the utilizations of containers */ 20 | container_utilization_monitor = g_hash_table_new_full( 21 | g_int64_hash, g_int64_equal, NULL, free); 22 | 23 | /* IDs of inherited sparse containers */ 24 | inherited_sparse_containers = g_hash_table_new_full(g_int64_hash, 25 | g_int64_equal, NULL, free); 26 | 27 | /* The first backup doesn't have inherited sparse containers. 
*/ 28 | if (jcr.id > 0) { 29 | 30 | sds fname = sdsdup(destor.working_directory); 31 | fname = sdscat(fname, "recipes/bv"); 32 | char s[20]; 33 | sprintf(s, "%d", jcr.id - 1); 34 | fname = sdscat(fname, s); 35 | fname = sdscat(fname, ".sparse"); 36 | 37 | FILE* sparse_file = fopen(fname, "r"); 38 | 39 | if (sparse_file) { 40 | char buf[128]; 41 | while (fgets(buf, 128, sparse_file) != NULL) { 42 | struct containerRecord *record = 43 | (struct containerRecord*) malloc( 44 | sizeof(struct containerRecord)); 45 | sscanf(buf, "%lld %d", &record->cid, &record->size); 46 | 47 | g_hash_table_insert(inherited_sparse_containers, &record->cid, 48 | record); 49 | } 50 | fclose(sparse_file); 51 | } 52 | 53 | sdsfree(fname); 54 | } 55 | 56 | NOTICE("Read %d inherited sparse containers", 57 | g_hash_table_size(inherited_sparse_containers)); 58 | 59 | } 60 | 61 | void har_monitor_update(containerid id, int32_t size) { 62 | TIMER_DECLARE(1); 63 | TIMER_BEGIN(1); 64 | struct containerRecord* record = g_hash_table_lookup( 65 | container_utilization_monitor, &id); 66 | if (record) { 67 | record->size += size; 68 | } else { 69 | 70 | record = (struct containerRecord*) malloc( 71 | sizeof(struct containerRecord)); 72 | record->cid = id; 73 | record->size = 0; 74 | g_hash_table_insert(container_utilization_monitor, 75 | &record->cid, record); 76 | 77 | record->size += size; 78 | 79 | } 80 | TIMER_END(1, jcr.rewrite_time); 81 | } 82 | 83 | static gint g_record_cmp(struct containerRecord *a, struct containerRecord* b, gpointer user_data){ 84 | return a->size - b->size; 85 | } 86 | 87 | void close_har() { 88 | sds fname = sdsdup(destor.working_directory); 89 | fname = sdscat(fname, "recipes/bv"); 90 | char s[20]; 91 | sprintf(s, "%d", jcr.id); 92 | fname = sdscat(fname, s); 93 | fname = sdscat(fname, ".sparse"); 94 | 95 | FILE* fp = fopen(fname, "w"); 96 | if (!fp) { 97 | fprintf(stderr, "Can not create sparse file"); 98 | perror("The reason is"); 99 | exit(1); 100 | } 101 | 102 | 
jcr.total_container_num = g_hash_table_size(container_utilization_monitor); 103 | 104 | GSequence *seq = g_sequence_new(NULL); 105 | int64_t total_size = 0; 106 | int64_t sparse_size = 0; 107 | 108 | /* collect sparse containers */ 109 | GHashTableIter iter; 110 | gpointer key, value; 111 | g_hash_table_iter_init(&iter, container_utilization_monitor); 112 | while (g_hash_table_iter_next(&iter, &key, &value)) { 113 | struct containerRecord* cr = (struct containerRecord*) value; 114 | total_size += cr->size; 115 | 116 | if((1.0*cr->size/(CONTAINER_SIZE - CONTAINER_META_SIZE)) 117 | < destor.rewrite_har_utilization_threshold){ 118 | /* It is sparse */ 119 | if (inherited_sparse_containers 120 | && g_hash_table_lookup(inherited_sparse_containers, &cr->cid)) 121 | /* It is an inherited sparse container */ 122 | jcr.inherited_sparse_num++; 123 | 124 | jcr.sparse_container_num++; 125 | sparse_size += cr->size; 126 | 127 | g_sequence_insert_sorted(seq, cr, g_record_cmp, NULL); 128 | } 129 | } 130 | 131 | /* 132 | * If the sparse size is too large, 133 | * we need to trim the sequence to control the rewrite ratio. 134 | * We use sparse_size/total_size to estimate the rewrite ratio of next backup. 135 | * However, the estimation is inaccurate (generally over-estimating), since: 136 | * 1. the sparse size is not an accurate indicator of utilization for next backup. 137 | * 2. self-references. 138 | */ 139 | while(destor.rewrite_har_rewrite_limit < 1 140 | && sparse_size*1.0/total_size > destor.rewrite_har_rewrite_limit){ 141 | /* 142 | * The expected rewrite ratio exceeds the limit. 143 | * We trim the last several records in the sequence. 
144 | * */ 145 | GSequenceIter* iter = g_sequence_iter_prev(g_sequence_get_end_iter(seq)); 146 | struct containerRecord* r = g_sequence_get(iter); 147 | VERBOSE("Trim sparse container %lld", r->cid); 148 | sparse_size -= r->size; 149 | g_sequence_remove(iter); 150 | } 151 | 152 | GSequenceIter* sparse_iter = g_sequence_get_begin_iter(seq); 153 | while(sparse_iter != g_sequence_get_end_iter(seq)){ 154 | struct containerRecord* r = g_sequence_get(sparse_iter); 155 | fprintf(fp, "%lld %d\n", r->cid, r->size); 156 | sparse_iter = g_sequence_iter_next(sparse_iter); 157 | } 158 | fclose(fp); 159 | 160 | NOTICE("Record %d sparse containers, and %d of them are inherited", 161 | g_sequence_get_length(seq), jcr.inherited_sparse_num); 162 | 163 | g_sequence_free(seq); 164 | sdsfree(fname); 165 | 166 | /* CMA: update the backup times in manifest */ 167 | update_manifest(container_utilization_monitor); 168 | } 169 | 170 | void har_check(struct chunk* c) { 171 | if (!CHECK_CHUNK(c, CHUNK_FILE_START) && !CHECK_CHUNK(c, CHUNK_FILE_END) 172 | && CHECK_CHUNK(c, CHUNK_DUPLICATE)) 173 | if (g_hash_table_lookup(inherited_sparse_containers, &c->id)) { 174 | SET_CHUNK(c, CHUNK_SPARSE); 175 | char code[41]; 176 | hash2code(c->fp, code); 177 | code[40] = 0; 178 | VERBOSE("chunk %s in sparse container %ld", code, c->id); 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /src/index/kvstore_htable.c: -------------------------------------------------------------------------------- 1 | /* 2 | * kvstore_htable.c 3 | * 4 | * Created on: Mar 23, 2014 5 | * Author: fumin 6 | */ 7 | 8 | #include "../destor.h" 9 | #include "index.h" 10 | 11 | typedef char* kvpair; 12 | 13 | #define get_key(kv) (kv) 14 | #define get_value(kv) ((int64_t*)(kv+destor.index_key_size)) 15 | 16 | static GHashTable *htable; 17 | 18 | static int32_t kvpair_size; 19 | 20 | /* 21 | * Create a new kv pair. 
22 | */ 23 | static kvpair new_kvpair_full(char* key){ 24 | kvpair kvp = malloc(kvpair_size); 25 | memcpy(get_key(kvp), key, destor.index_key_size); 26 | int64_t* values = get_value(kvp); 27 | int i; 28 | for(i = 0; i=4) 63 | htable = g_hash_table_new_full(g_int_hash, g_feature_equal, 64 | free_kvpair, NULL); 65 | else 66 | htable = g_hash_table_new_full(g_feature_hash, g_feature_equal, 67 | free_kvpair, NULL); 68 | 69 | sds indexpath = sdsdup(destor.working_directory); 70 | indexpath = sdscat(indexpath, "index/htable"); 71 | 72 | /* Initialize the feature index from the dump file. */ 73 | FILE *fp; 74 | if ((fp = fopen(indexpath, "r"))) { 75 | /* The number of features */ 76 | int key_num; 77 | fread(&key_num, sizeof(int), 1, fp); 78 | for (; key_num > 0; key_num--) { 79 | /* Read a feature */ 80 | kvpair kv = new_kvpair(); 81 | fread(get_key(kv), destor.index_key_size, 1, fp); 82 | 83 | /* The number of segments/containers the feature refers to. */ 84 | int id_num, i; 85 | fread(&id_num, sizeof(int), 1, fp); 86 | assert(id_num <= destor.index_value_length); 87 | 88 | for (i = 0; i < id_num; i++) 89 | /* Read an ID */ 90 | fread(&get_value(kv)[i], sizeof(int64_t), 1, fp); 91 | 92 | g_hash_table_insert(htable, get_key(kv), kv); 93 | } 94 | fclose(fp); 95 | } 96 | 97 | sdsfree(indexpath); 98 | } 99 | 100 | void close_kvstore_htable() { 101 | sds indexpath = sdsdup(destor.working_directory); 102 | indexpath = sdscat(indexpath, "index/htable"); 103 | 104 | FILE *fp; 105 | if ((fp = fopen(indexpath, "w")) == NULL) { 106 | perror("Can not open index/htable for write because:"); 107 | exit(1); 108 | } 109 | 110 | NOTICE("flushing hash table!"); 111 | int key_num = g_hash_table_size(htable); 112 | fwrite(&key_num, sizeof(int), 1, fp); 113 | 114 | GHashTableIter iter; 115 | gpointer key, value; 116 | g_hash_table_iter_init(&iter, htable); 117 | while (g_hash_table_iter_next(&iter, &key, &value)) { 118 | 119 | /* Write a feature. 
*/ 120 | kvpair kv = value; 121 | if(fwrite(get_key(kv), destor.index_key_size, 1, fp) != 1){ 122 | perror("Fail to write a key!"); 123 | exit(1); 124 | } 125 | 126 | /* Write the number of segments/containers */ 127 | if(fwrite(&destor.index_value_length, sizeof(int), 1, fp) != 1){ 128 | perror("Fail to write a length!"); 129 | exit(1); 130 | } 131 | int i; 132 | for (i = 0; i < destor.index_value_length; i++) 133 | if(fwrite(&get_value(kv)[i], sizeof(int64_t), 1, fp) != 1){ 134 | perror("Fail to write a value!"); 135 | exit(1); 136 | } 137 | 138 | } 139 | 140 | /* It is a rough estimation */ 141 | destor.index_memory_footprint = g_hash_table_size(htable) 142 | * (destor.index_key_size + sizeof(int64_t) * destor.index_value_length + 4); 143 | 144 | fclose(fp); 145 | 146 | NOTICE("flushing hash table successfully!"); 147 | 148 | sdsfree(indexpath); 149 | 150 | g_hash_table_destroy(htable); 151 | } 152 | 153 | /* 154 | * For top-k selection method. 155 | */ 156 | int64_t* kvstore_htable_lookup(char* key) { 157 | kvpair kv = g_hash_table_lookup(htable, key); 158 | return kv ? get_value(kv) : NULL; 159 | } 160 | 161 | void kvstore_htable_update(char* key, int64_t id) { 162 | kvpair kv = g_hash_table_lookup(htable, key); 163 | if (!kv) { 164 | kv = new_kvpair_full(key); 165 | g_hash_table_replace(htable, get_key(kv), kv); 166 | } 167 | kv_update(kv, id); 168 | } 169 | 170 | /* Remove the 'id' from the kvpair identified by 'key' */ 171 | void kvstore_htable_delete(char* key, int64_t id){ 172 | kvpair kv = g_hash_table_lookup(htable, key); 173 | if(!kv) 174 | return; 175 | 176 | int64_t *value = get_value(kv); 177 | int i; 178 | for(i=0; i size ? size : destor.chunk_avg_size; 14 | } 15 | 16 | /* 17 | * chunk-level deduplication. 18 | * Destor currently supports fixed-sized chunking and (normalized) rabin-based chunking. 
19 | */ 20 | static void* chunk_thread(void *arg) { 21 | int leftlen = 0; 22 | int leftoff = 0; 23 | unsigned char *leftbuf = malloc(DEFAULT_BLOCK_SIZE + destor.chunk_max_size); 24 | 25 | unsigned char *zeros = malloc(destor.chunk_max_size); 26 | bzero(zeros, destor.chunk_max_size); 27 | unsigned char *data = malloc(destor.chunk_max_size); 28 | 29 | struct chunk* c = NULL; 30 | 31 | while (1) { 32 | 33 | /* Try to receive a CHUNK_FILE_START. */ 34 | c = sync_queue_pop(read_queue); 35 | 36 | if (c == NULL) { 37 | sync_queue_term(chunk_queue); 38 | break; 39 | } 40 | 41 | assert(CHECK_CHUNK(c, CHUNK_FILE_START)); 42 | sync_queue_push(chunk_queue, c); 43 | 44 | /* Try to receive normal chunks. */ 45 | c = sync_queue_pop(read_queue); 46 | if (!CHECK_CHUNK(c, CHUNK_FILE_END)) { 47 | memcpy(leftbuf, c->data, c->size); 48 | leftlen += c->size; 49 | free_chunk(c); 50 | c = NULL; 51 | } 52 | 53 | while (1) { 54 | /* c == NULL indicates more data for this file can be read. */ 55 | while ((leftlen < destor.chunk_max_size) && c == NULL) { 56 | c = sync_queue_pop(read_queue); 57 | if (!CHECK_CHUNK(c, CHUNK_FILE_END)) { 58 | memmove(leftbuf, leftbuf + leftoff, leftlen); 59 | leftoff = 0; 60 | memcpy(leftbuf + leftlen, c->data, c->size); 61 | leftlen += c->size; 62 | free_chunk(c); 63 | c = NULL; 64 | } 65 | } 66 | 67 | if (leftlen == 0) { 68 | assert(c); 69 | break; 70 | } 71 | 72 | TIMER_DECLARE(1); 73 | TIMER_BEGIN(1); 74 | 75 | int chunk_size = chunking(leftbuf + leftoff, leftlen); 76 | 77 | TIMER_END(1, jcr.chunk_time); 78 | 79 | struct chunk *nc = new_chunk(chunk_size); 80 | memcpy(nc->data, leftbuf + leftoff, chunk_size); 81 | leftlen -= chunk_size; 82 | leftoff += chunk_size; 83 | 84 | if (memcmp(zeros, nc->data, chunk_size) == 0) { 85 | VERBOSE("Chunk phase: %ldth chunk of %d zero bytes", 86 | chunk_num++, chunk_size); 87 | jcr.zero_chunk_num++; 88 | jcr.zero_chunk_size += chunk_size; 89 | } else 90 | VERBOSE("Chunk phase: %ldth chunk of %d bytes", chunk_num++, 91 | 
chunk_size); 92 | 93 | sync_queue_push(chunk_queue, nc); 94 | } 95 | 96 | sync_queue_push(chunk_queue, c); 97 | leftoff = 0; 98 | c = NULL; 99 | 100 | if(destor.chunk_algorithm == CHUNK_RABIN || 101 | destor.chunk_algorithm == CHUNK_NORMALIZED_RABIN) 102 | windows_reset(); 103 | 104 | } 105 | 106 | free(leftbuf); 107 | free(zeros); 108 | free(data); 109 | return NULL; 110 | } 111 | 112 | 113 | 114 | void start_chunk_phase() { 115 | 116 | if (destor.chunk_algorithm == CHUNK_RABIN){ 117 | int pwr; 118 | for (pwr = 0; destor.chunk_avg_size; pwr++) { 119 | destor.chunk_avg_size >>= 1; 120 | } 121 | destor.chunk_avg_size = 1 << (pwr - 1); 122 | 123 | assert(destor.chunk_avg_size >= destor.chunk_min_size); 124 | assert(destor.chunk_avg_size <= destor.chunk_max_size); 125 | assert(destor.chunk_max_size <= CONTAINER_SIZE - CONTAINER_META_SIZE); 126 | 127 | chunkAlg_init(); 128 | chunking = rabin_chunk_data; 129 | }else if(destor.chunk_algorithm == CHUNK_NORMALIZED_RABIN){ 130 | int pwr; 131 | for (pwr = 0; destor.chunk_avg_size; pwr++) { 132 | destor.chunk_avg_size >>= 1; 133 | } 134 | destor.chunk_avg_size = 1 << (pwr - 1); 135 | 136 | assert(destor.chunk_avg_size >= destor.chunk_min_size); 137 | assert(destor.chunk_avg_size <= destor.chunk_max_size); 138 | assert(destor.chunk_max_size <= CONTAINER_SIZE - CONTAINER_META_SIZE); 139 | 140 | chunkAlg_init(); 141 | chunking = normalized_rabin_chunk_data; 142 | }else if(destor.chunk_algorithm == CHUNK_TTTD){ 143 | int pwr; 144 | for (pwr = 0; destor.chunk_avg_size; pwr++) { 145 | destor.chunk_avg_size >>= 1; 146 | } 147 | destor.chunk_avg_size = 1 << (pwr - 1); 148 | 149 | assert(destor.chunk_avg_size >= destor.chunk_min_size); 150 | assert(destor.chunk_avg_size <= destor.chunk_max_size); 151 | assert(destor.chunk_max_size <= CONTAINER_SIZE - CONTAINER_META_SIZE); 152 | 153 | chunkAlg_init(); 154 | chunking = tttd_chunk_data; 155 | }else if(destor.chunk_algorithm == CHUNK_FIXED){ 156 | assert(destor.chunk_avg_size <= 
CONTAINER_SIZE - CONTAINER_META_SIZE); 157 | 158 | destor.chunk_max_size = destor.chunk_avg_size; 159 | chunking = fixed_chunk_data; 160 | }else if(destor.chunk_algorithm == CHUNK_FILE){ 161 | /* 162 | * approximate file-level deduplication 163 | * It splits the stream according to file boundaries. 164 | * For a larger file, we need to split it due to container size limit. 165 | * Hence, our approximate file-level deduplication are only for files smaller than CONTAINER_SIZE - CONTAINER_META_SIZE. 166 | * Similar to fixed-sized chunking of $(( CONTAINER_SIZE - CONTAINER_META_SIZE )) chunk size. 167 | * */ 168 | destor.chunk_avg_size = CONTAINER_SIZE - CONTAINER_META_SIZE; 169 | destor.chunk_max_size = CONTAINER_SIZE - CONTAINER_META_SIZE; 170 | chunking = fixed_chunk_data; 171 | }else if(destor.chunk_algorithm == CHUNK_AE){ 172 | assert(destor.chunk_avg_size <= destor.chunk_max_size); 173 | assert(destor.chunk_max_size <= CONTAINER_SIZE - CONTAINER_META_SIZE); 174 | 175 | chunking = ae_chunk_data; 176 | ae_init(); 177 | }else{ 178 | NOTICE("Invalid chunking algorithm"); 179 | exit(1); 180 | } 181 | 182 | chunk_queue = sync_queue_new(100); 183 | pthread_create(&chunk_t, NULL, chunk_thread, NULL); 184 | } 185 | 186 | void stop_chunk_phase() { 187 | pthread_join(chunk_t, NULL); 188 | NOTICE("chunk phase stops successfully!"); 189 | } 190 | -------------------------------------------------------------------------------- /scripts/destor.config: -------------------------------------------------------------------------------- 1 | # Specify the working directory of destor. 2 | # All metadata and data locate in the directory. 3 | working-directory "/home/data/working/" 4 | 5 | # Specify the simulation level: NO, RESTORE, APPEND, ALL. 6 | # level 'all' indicates the inputs of destor are trace files generated by 'destor -t' 7 | # other levels indicate the inputs are regular file. 
8 | simulation-level no 9 | 10 | # Specify the trace format 11 | # two trace formats are supported: (1) traces generated by destor -t; (2) the FSL traces 12 | # trace-format fsl 13 | trace-format destor 14 | 15 | # Specify the destor verbosity level: debug, verbose, notice, warning. 16 | log-level notice 17 | 18 | # Specify the chunking algorithm. 19 | # It can be rabin, fixed, "normalized rabin", file, ae. 20 | # "file" indicates an approximiate file-level deduplication. 21 | # For example, 22 | # chunk-algorithm "normalized rabin" 23 | # chunk-algorithm file 24 | # chunk-algorithm fixed 25 | # chunk-algorithm tttd 26 | chunk-algorithm rabin 27 | 28 | # Specify the average/maximal/minmal chunk size in bytes. 29 | #chunk-avg-size 4096 30 | chunk-avg-size 4096 31 | chunk-max-size 65536 32 | chunk-min-size 512 33 | 34 | #################################################################################### 35 | # Categories of fingerprint indexes 36 | # ----------------------------------------------------------------------------------- 37 | # - - physical locality - logical locality - 38 | # ----------------------------------------------------------------------------------- 39 | # - exact dedup - DDFS, ChunkStash - Block Locality Caching - 40 | # ----------------------------------------------------------------------------------- 41 | # - near-exact dedup - Sampled Index - Extreme Binning, Sparse Index, SiLo - 42 | # ----------------------------------------------------------------------------------- 43 | #################################################################################### 44 | 45 | # Specify the fingerprint index. 46 | # It can be ddfs, silo, "extreme binning", "sparse index", 47 | # "sampled index", "block locality caching". 
48 | # For example, 49 | # fingerprint-index near-exact logical "extreme binning" 50 | # fingerprint-index near-exact logical "sparse index" 51 | # fingerprint-index exact logical "block locality caching" 52 | # (The fourth arg specifies a combo) 53 | # fingerprint-index near-exact physical "sampled index" 54 | fingerprint-index exact physical 55 | 56 | # Specify the key-value store 57 | fingerprint-index-key-value htable 58 | 59 | # Specify the fingerprint cache size 60 | # in the size of container (only metadata part) or segmentRecipe. 61 | fingerprint-index-cache-size 128 62 | 63 | # Specify the sampling method and ratio. 64 | # For example, 65 | # fingerprint-index-sampling-method min 0 66 | # (min 0 indicates select a feature per segment or container.) 67 | # fingerprint-index-sampling-method min 128 68 | # fingerprint-index-sampling-method optmin 128 69 | # fingerprint-index-sampling-method random 128 70 | # fingerprint-index-sampling-method uniform 64 71 | fingerprint-index-sampling-method random 128 72 | 73 | # Configure the key-value store 74 | # The size of a feature in byte 75 | fingerprint-index-key-size 20 76 | # Specify the maximum number of segments a feature can refer to. 
77 | fingerprint-index-value-length 1 78 | 79 | ######################################################################### 80 | # Design elements of fingerprint indexes exploiting similarity 81 | # -------------------------------------------------------------------------------------- 82 | # - - Extreme Binning - Sparse Index - SiLo - BLC - 83 | # - segment algorithm - file-defined - content-defined - fixed - fixed - 84 | # - sampling method - min - random - min - (no) - 85 | # - segment selection - all - top-k - top-1 - base - 86 | # - segment prefetching - no - no - yes - yes - 87 | # -------------------------------------------------------------------------------------- 88 | ######################################################################### 89 | 90 | # Specify the segment algorithm and the segment size. Such as 91 | # fingerprint-index-segment-algorithm file-defined 92 | # fingerprint-index-segment-algorithm content-defined 512 93 | # fingerprint-index-segment-algorithm fixed 1024 94 | fingerprint-index-segment-algorithm content-defined 1024 95 | 96 | # Specify the min and max length of a segment, in terms of # of chunks. 97 | fingerprint-index-segment-boundary 128 10240 98 | 99 | # Specify the method of segment selection 100 | # for example, 101 | # fingerprint-index-segment-selection top 2 102 | # fingerprint-index-segment-selection base 103 | # fingerprint-index-segment-selection mix 104 | fingerprint-index-segment-selection base 105 | 106 | # Specify the number of prefetched segments. 107 | # 1 indicates only prefetching the required segment. 108 | fingerprint-index-segment-prefetching 1 109 | 110 | # Specify the rewriting algorithm and the size of rewrite buffer. 111 | # It can be "no", "cfl-based selective deduplication" (or "cfl" in short), 112 | # "context-based rewriting" (or cbr), "capping" (or cap). 113 | # For example, 114 | # rewrite-algorithm cbr 2048 115 | rewrite-algorithm no 116 | 117 | # Enable/disable the cfl switch. 
118 | # If the cfl switch is enabled,
119 | # destor will monitor the CFL metric in real time,
120 | # and stop rewriting out-of-order chunks if the CFL is high.
121 | rewrite-enable-cfl-switch no
122 | 
123 | # Specify cfl-require in cfl.
124 | rewrite-cfl-require 0.6
125 | 
126 | # Specify cfl-usage-threshold in cfl.
127 | rewrite-cfl-usage-threshold 0.03
128 | 
129 | # Specify rewrite limit in cbr.
130 | rewrite-cbr-limit 0.05
131 | 
132 | # Specify minimal utility in cbr.
133 | rewrite-cbr-minimal-utility 0.5
134 | 
135 | # Specify capping level in capping.
136 | # The original definition of capping level (in the paper) is defined as rewrite-capping-level:rewrite buffer size.
137 | rewrite-capping-level 14
138 | 
139 | # Enable History-Aware Rewriting (HAR),
140 | # rewrite-enable-har yes
141 | rewrite-enable-har no
142 | 
143 | # Specify the utilization threshold of HAR.
144 | rewrite-har-utilization-threshold 0.5
145 | # Specify the rewrite limit of har.
146 | # rewrite-har-rewrite-limit 0.05
147 | rewrite-har-rewrite-limit 0.05
148 | 
149 | # Enable Cache-Aware Filter.
150 | rewrite-enable-cache-aware no
151 | 
152 | # Specify the restore cache,
153 | # and the size of the restore cache in the number of containers.
154 | # It can be lru, "optimal cache" (or opt), and "forward assembly" (or asm).
155 | restore-cache lru 30
156 | 
157 | # Specify the window size of the optimal restore cache.
158 | restore-opt-window-size 1000000
159 | 
160 | # Configuration of the maximum number of backups retained at any moment.
161 | # The expired backup is deleted automatically.
162 | # A value of 500 indicates only the recent 500 backups are retained.
163 | # A negative value, such as -1, indicates all backups are retained.
164 | backup-retention-time -1 165 | -------------------------------------------------------------------------------- /src/do_backup.c: -------------------------------------------------------------------------------- 1 | #include "destor.h" 2 | #include "jcr.h" 3 | #include "utils/sync_queue.h" 4 | #include "index/index.h" 5 | #include "backup.h" 6 | #include "storage/containerstore.h" 7 | 8 | /* defined in index.c */ 9 | extern struct { 10 | /* Requests to the key-value store */ 11 | int lookup_requests; 12 | int update_requests; 13 | int lookup_requests_for_unique; 14 | /* Overheads of prefetching module */ 15 | int read_prefetching_units; 16 | }index_overhead; 17 | 18 | void do_backup(char *path) { 19 | 20 | init_recipe_store(); 21 | init_container_store(); 22 | init_index(); 23 | 24 | init_backup_jcr(path); 25 | 26 | puts("==== backup begin ===="); 27 | 28 | TIMER_DECLARE(1); 29 | TIMER_BEGIN(1); 30 | 31 | time_t start = time(NULL); 32 | if (destor.simulation_level == SIMULATION_ALL) { 33 | start_read_trace_phase(); 34 | } else { 35 | start_read_phase(); 36 | start_chunk_phase(); 37 | start_hash_phase(); 38 | } 39 | start_dedup_phase(); 40 | start_rewrite_phase(); 41 | start_filter_phase(); 42 | 43 | do{ 44 | sleep(5); 45 | /*time_t now = time(NULL);*/ 46 | fprintf(stderr,"job %" PRId32 ", %" PRId64 " bytes, %" PRId32 " chunks, %d files processed\r", 47 | jcr.id, jcr.data_size, jcr.chunk_num, jcr.file_num); 48 | }while(jcr.status == JCR_STATUS_RUNNING || jcr.status != JCR_STATUS_DONE); 49 | fprintf(stderr,"job %" PRId32 ", %" PRId64 " bytes, %" PRId32 " chunks, %d files processed\n", 50 | jcr.id, jcr.data_size, jcr.chunk_num, jcr.file_num); 51 | 52 | if (destor.simulation_level == SIMULATION_ALL) { 53 | stop_read_trace_phase(); 54 | } else { 55 | stop_read_phase(); 56 | stop_chunk_phase(); 57 | stop_hash_phase(); 58 | } 59 | stop_dedup_phase(); 60 | stop_rewrite_phase(); 61 | stop_filter_phase(); 62 | 63 | TIMER_END(1, jcr.total_time); 64 | 65 | close_index(); 
66 | close_container_store(); 67 | close_recipe_store(); 68 | 69 | update_backup_version(jcr.bv); 70 | 71 | free_backup_version(jcr.bv); 72 | 73 | puts("==== backup end ===="); 74 | 75 | printf("job id: %" PRId32 "\n", jcr.id); 76 | printf("backup path: %s\n", jcr.path); 77 | printf("number of files: %d\n", jcr.file_num); 78 | printf("number of chunks: %" PRId32 " (%" PRId64 " bytes on average)\n", jcr.chunk_num, 79 | jcr.data_size / jcr.chunk_num); 80 | printf("number of unique chunks: %" PRId32 "\n", jcr.unique_chunk_num); 81 | printf("total size(B): %" PRId64 "\n", jcr.data_size); 82 | printf("stored data size(B): %" PRId64 "\n", 83 | jcr.unique_data_size + jcr.rewritten_chunk_size); 84 | printf("deduplication ratio: %.4f, %.4f\n", 85 | jcr.data_size != 0 ? 86 | (jcr.data_size - jcr.unique_data_size 87 | - jcr.rewritten_chunk_size) 88 | / (double) (jcr.data_size) : 89 | 0, 90 | jcr.data_size 91 | / (double) (jcr.unique_data_size + jcr.rewritten_chunk_size)); 92 | printf("total time(s): %.3f\n", jcr.total_time / 1000000); 93 | printf("throughput(MB/s): %.2f\n", 94 | (double) jcr.data_size * 1000000 / (1024 * 1024 * jcr.total_time)); 95 | printf("number of zero chunks: %" PRId32 "\n", jcr.zero_chunk_num); 96 | printf("size of zero chunks: %" PRId64 "\n", jcr.zero_chunk_size); 97 | printf("number of rewritten chunks: %" PRId32 "\n", jcr.rewritten_chunk_num); 98 | printf("size of rewritten chunks: %" PRId64 "\n", jcr.rewritten_chunk_size); 99 | printf("rewritten rate in size: %.3f\n", 100 | jcr.rewritten_chunk_size / (double) jcr.data_size); 101 | 102 | destor.data_size += jcr.data_size; 103 | destor.stored_data_size += jcr.unique_data_size + jcr.rewritten_chunk_size; 104 | 105 | destor.chunk_num += jcr.chunk_num; 106 | destor.stored_chunk_num += jcr.unique_chunk_num + jcr.rewritten_chunk_num; 107 | destor.zero_chunk_num += jcr.zero_chunk_num; 108 | destor.zero_chunk_size += jcr.zero_chunk_size; 109 | destor.rewritten_chunk_num += jcr.rewritten_chunk_num; 110 | 
destor.rewritten_chunk_size += jcr.rewritten_chunk_size; 111 | 112 | printf("read_time : %.3fs, %.2fMB/s\n", jcr.read_time / 1000000, 113 | jcr.data_size * 1000000 / jcr.read_time / 1024 / 1024); 114 | printf("chunk_time : %.3fs, %.2fMB/s\n", jcr.chunk_time / 1000000, 115 | jcr.data_size * 1000000 / jcr.chunk_time / 1024 / 1024); 116 | printf("hash_time : %.3fs, %.2fMB/s\n", jcr.hash_time / 1000000, 117 | jcr.data_size * 1000000 / jcr.hash_time / 1024 / 1024); 118 | 119 | printf("dedup_time : %.3fs, %.2fMB/s\n", 120 | jcr.dedup_time / 1000000, 121 | jcr.data_size * 1000000 / jcr.dedup_time / 1024 / 1024); 122 | 123 | printf("rewrite_time : %.3fs, %.2fMB/s\n", jcr.rewrite_time / 1000000, 124 | jcr.data_size * 1000000 / jcr.rewrite_time / 1024 / 1024); 125 | 126 | printf("filter_time : %.3fs, %.2fMB/s\n", 127 | jcr.filter_time / 1000000, 128 | jcr.data_size * 1000000 / jcr.filter_time / 1024 / 1024); 129 | 130 | printf("write_time : %.3fs, %.2fMB/s\n", jcr.write_time / 1000000, 131 | jcr.data_size * 1000000 / jcr.write_time / 1024 / 1024); 132 | 133 | //double seek_time = 0.005; //5ms 134 | //double bandwidth = 120 * 1024 * 1024; //120MB/s 135 | 136 | /* double index_lookup_throughput = jcr.data_size 137 | / (index_read_times * seek_time 138 | + index_read_entry_counter * 24 / bandwidth) / 1024 / 1024; 139 | 140 | double write_data_throughput = 1.0 * jcr.data_size * bandwidth 141 | / (jcr->unique_chunk_num) / 1024 / 1024; 142 | double index_read_throughput = 1.0 * jcr.data_size / 1024 / 1024 143 | / (index_read_times * seek_time 144 | + index_read_entry_counter * 24 / bandwidth); 145 | double index_write_throughput = 1.0 * jcr.data_size / 1024 / 1024 146 | / (index_write_times * seek_time 147 | + index_write_entry_counter * 24 / bandwidth);*/ 148 | 149 | /* double estimated_throughput = write_data_throughput; 150 | if (estimated_throughput > index_read_throughput) 151 | estimated_throughput = index_read_throughput;*/ 152 | /*if (estimated_throughput > 
index_write_throughput) 153 | estimated_throughput = index_write_throughput;*/ 154 | 155 | char logfile[] = "backup.log"; 156 | FILE *fp = fopen(logfile, "a"); 157 | /* 158 | * job id, 159 | * the size of backup 160 | * accumulative consumed capacity, 161 | * deduplication rate, 162 | * rewritten rate, 163 | * total container number, 164 | * sparse container number, 165 | * inherited container number, 166 | * 4 * index overhead (4 * int) 167 | * throughput, 168 | */ 169 | fprintf(fp, "%" PRId32 " %" PRId64 " %" PRId64 " %.4f %.4f %" PRId32 " %" PRId32 " %" PRId32 " %" PRId32" %" PRId32 " %" PRId32" %" PRId32" %.2f\n", 170 | jcr.id, 171 | jcr.data_size, 172 | destor.stored_data_size, 173 | jcr.data_size != 0 ? 174 | (jcr.data_size - jcr.rewritten_chunk_size - jcr.unique_data_size)/(double) (jcr.data_size) 175 | : 0, 176 | jcr.data_size != 0 ? (double) (jcr.rewritten_chunk_size) / (double) (jcr.data_size) : 0, 177 | jcr.total_container_num, 178 | jcr.sparse_container_num, 179 | jcr.inherited_sparse_num, 180 | index_overhead.lookup_requests, 181 | index_overhead.lookup_requests_for_unique, 182 | index_overhead.update_requests, 183 | index_overhead.read_prefetching_units, 184 | (double) jcr.data_size * 1000000 / (1024 * 1024 * jcr.total_time)); 185 | 186 | fclose(fp); 187 | 188 | } 189 | -------------------------------------------------------------------------------- /destor.config: -------------------------------------------------------------------------------- 1 | # Specify the working directory of destor. 2 | # All metadata and data locate in the directory. 3 | working-directory "/home/data/working/" 4 | 5 | # Specify the simulation level: NO, RESTORE, APPEND, ALL. 6 | # level 'all' indicates the inputs of destor are trace files generated by 'destor -t' 7 | # other levels indicate the inputs are regular file. 
8 | simulation-level no 9 | 10 | # Specify the trace format 11 | # two trace formats are supported: (1) traces generated by destor -t; (2) the FSL traces 12 | # trace-format fsl 13 | trace-format destor 14 | 15 | # Specify the destor verbosity level: debug, verbose, notice, warning. 16 | log-level notice 17 | 18 | # Specify the chunking algorithm. 19 | # It can be rabin, fixed, "normalized rabin", tttd, file, and ae. 20 | # "file" indicates an approximiate file-level deduplication. 21 | # For example, 22 | # chunk-algorithm "normalized rabin" 23 | # chunk-algorithm file 24 | # chunk-algorithm fixed 25 | # chunk-algorithm tttd 26 | # chunk-algorithm ae 27 | chunk-algorithm rabin 28 | 29 | # Specify the average/maximal/minmal chunk size in bytes. 30 | #chunk-avg-size 4096 31 | chunk-avg-size 4096 32 | chunk-max-size 65536 33 | chunk-min-size 512 34 | 35 | #################################################################################### 36 | # Categories of fingerprint indexes 37 | # ----------------------------------------------------------------------------------- 38 | # - - physical locality - logical locality - 39 | # ----------------------------------------------------------------------------------- 40 | # - exact dedup - DDFS, ChunkStash - Block Locality Caching - 41 | # ----------------------------------------------------------------------------------- 42 | # - near-exact dedup - Sampled Index - Extreme Binning, Sparse Index, SiLo - 43 | # ----------------------------------------------------------------------------------- 44 | #################################################################################### 45 | 46 | # Specify the fingerprint index. 47 | # It can be ddfs, silo, "extreme binning", "sparse index", 48 | # "sampled index", "block locality caching". 
49 | # For example, 50 | # fingerprint-index near-exact logical "extreme binning" 51 | # fingerprint-index near-exact logical "sparse index" 52 | # fingerprint-index exact logical "block locality caching" 53 | # (The fourth arg specifies a combo) 54 | # fingerprint-index near-exact physical "sampled index" 55 | fingerprint-index exact physical 56 | 57 | # Specify the key-value store 58 | fingerprint-index-key-value htable 59 | 60 | # Specify the fingerprint cache size 61 | # in the size of container (only metadata part) or segmentRecipe. 62 | fingerprint-index-cache-size 128 63 | 64 | # Specify the sampling method and ratio. 65 | # For example, 66 | # fingerprint-index-sampling-method min 0 67 | # (min 0 indicates select a feature per segment or container.) 68 | # fingerprint-index-sampling-method min 128 69 | # fingerprint-index-sampling-method optmin 128 70 | # fingerprint-index-sampling-method random 128 71 | # fingerprint-index-sampling-method uniform 64 72 | fingerprint-index-sampling-method random 128 73 | 74 | # Configure the key-value store 75 | # The size of a feature in byte 76 | fingerprint-index-key-size 20 77 | # Specify the maximum number of segments a feature can refer to. 
78 | fingerprint-index-value-length 1 79 | 80 | ######################################################################### 81 | # Design elements of fingerprint indexes exploiting similarity 82 | # -------------------------------------------------------------------------------------- 83 | # - - Extreme Binning - Sparse Index - SiLo - BLC - 84 | # - segment algorithm - file-defined - content-defined - fixed - fixed - 85 | # - sampling method - min - random - min - (no) - 86 | # - segment selection - all - top-k - top-1 - base - 87 | # - segment prefetching - no - no - yes - yes - 88 | # -------------------------------------------------------------------------------------- 89 | ######################################################################### 90 | 91 | # Specify the segment algorithm and the segment size. Such as 92 | # fingerprint-index-segment-algorithm file-defined 93 | # fingerprint-index-segment-algorithm content-defined 512 94 | # fingerprint-index-segment-algorithm fixed 1024 95 | fingerprint-index-segment-algorithm content-defined 1024 96 | 97 | # Specify the min and max length of a segment, in terms of # of chunks. 98 | fingerprint-index-segment-boundary 128 10240 99 | 100 | # Specify the method of segment selection 101 | # for example, 102 | # fingerprint-index-segment-selection top 2 103 | # fingerprint-index-segment-selection base 104 | # fingerprint-index-segment-selection mix 105 | fingerprint-index-segment-selection base 106 | 107 | # Specify the number of prefetched segments. 108 | # 1 indicates only prefetching the required segment. 109 | fingerprint-index-segment-prefetching 1 110 | 111 | # Specify the rewriting algorithm and the size of rewrite buffer. 112 | # It can be "no", "cfl-based selective deduplication" (or "cfl" in short), 113 | # "context-based rewriting" (or cbr), "capping" (or cap). 114 | # For example, 115 | # rewrite-algorithm cbr 2048 116 | rewrite-algorithm no 117 | 118 | # Enable/disable the cfl switch. 
119 | # If the cfl switch is enabled,
120 | # destor will monitor the CFL metric in real time,
121 | # and stop rewriting out-of-order chunks if the CFL is high.
122 | rewrite-enable-cfl-switch no
123 | 
124 | # Specify cfl-require in cfl.
125 | rewrite-cfl-require 0.6
126 | 
127 | # Specify cfl-usage-threshold in cfl.
128 | rewrite-cfl-usage-threshold 0.03
129 | 
130 | # Specify rewrite limit in cbr.
131 | rewrite-cbr-limit 0.05
132 | 
133 | # Specify minimal utility in cbr.
134 | rewrite-cbr-minimal-utility 0.5
135 | 
136 | # Specify capping level in capping.
137 | # The original definition of capping level (in the paper) is defined as rewrite-capping-level:rewrite buffer size.
138 | rewrite-capping-level 14
139 | 
140 | # Enable History-Aware Rewriting (HAR),
141 | # rewrite-enable-har yes
142 | rewrite-enable-har no
143 | 
144 | # Specify the utilization threshold of HAR.
145 | rewrite-har-utilization-threshold 0.5
146 | # Specify the rewrite limit of har.
147 | # rewrite-har-rewrite-limit 0.05
148 | rewrite-har-rewrite-limit 0.05
149 | 
150 | # Enable Cache-Aware Filter.
151 | rewrite-enable-cache-aware no
152 | 
153 | # Specify the restore cache,
154 | # and the size of the restore cache in the number of containers.
155 | # It can be lru, "optimal cache" (or opt), and "forward assembly" (or asm).
156 | restore-cache lru 30
157 | 
158 | # Specify the window size of the optimal restore cache.
159 | restore-opt-window-size 1000000
160 | 
161 | # Configuration of the maximum number of backups retained at any moment.
162 | # The expired backup is deleted automatically.
163 | # A value of 500 indicates only the recent 500 backups are retained.
164 | # A negative value, such as -1, indicates all backups are retained.
165 | backup-retention-time -1 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | General Information 2 | ------------------- 3 | Destor is a platform for data deduplication evaluation. 4 | 5 | Features 6 | -------- 7 | 1. Container-based storage; 8 | 2. Chunk-level pipeline; 9 | 3. Fixed-sized chunking, Content-Defined Chunking (CDC) and an approximate file-level deduplication; 10 | 4. A variety of fingerprint indexes, including DDFS, Extreme Binning, Sparse Index, SiLo, etc. 11 | 5. A variety of rewriting algorithms, including CFL, CBR, CAP, HAR etc. 12 | 6. A variety of restore algorithms, including LRU, optimal replacement algorithm, rolling forward assembly. 13 | 14 | Related papers 15 | -------------- 16 | 1. The chunking algorithm: 17 | > a) A Low-bandwidth Network File System @ SOSP'02. 18 | > 19 | > b) A Framework for Analyzing and Improving Content-Based Chunking Algorithms @ HP technical report. 20 | > 21 | > c) AE: An Asymmetric Extremum Content Defined Chunking Algorithm for Fast and Bandwidth-Efficient Data Deduplication @ IEEE Infocom'15. 22 | 23 | 2. The fingerprint index: 24 | > a) Avoiding the Disk Bottleneck in the Data Domain Deduplication File System @ FAST'08. 25 | > 26 | > b) Sparse Indexing: Large Scale, Inline Deduplication Using Sampling and Locality @ FAST'09. 27 | > 28 | > c) Extreme Binning: Scalable, Parallel Deduplication for Chunk-based File Backup @ MASCOTS'09. 29 | > 30 | > d) SiLo: A Similarity-Locality based Near-Exact Deduplicatin Scheme with Low RAM Overhead and High Throughput @ USENIX ATC'11. 31 | > 32 | > e) Building a High-Performance Deduplication System @ USENIX ATC'11. 33 | > 34 | > f) Block Locality Caching for Data Deduplication @ SYSTOR'13. 35 | > 36 | > g) The design of a similarity based deduplication system @ SYSTOR'09. 37 | 38 | 3. 
The fragmentation: 39 | > a) Chunk Fragmentation Level: An Effective Indicator for Read Performance Degradation in Deduplication Storage @ HPCC'11. 40 | > 41 | > b) Assuring Demanded Read Performance of Data Deduplication Storage with Backup Datasets @ MASCOTS'12. 42 | > 43 | > c) Reducing impact of data fragmentation caused by in-line deduplication @ SYSTOR'12. 44 | > 45 | > d) Improving Restore Speed for Backup Systems that Use Inline Chunk-Based Deduplication @ FAST'13. 46 | > 47 | > e) Accelerating Restore and Garbage Collection in Deduplication-based Backup Systems via Exploiting Historical Information @ USENIX ATC'14. 48 | > 49 | > f) Reducing Fragmentation for In-line Deduplication Backup Storage via Exploiting Historical Information and Cache Knowledge @ IEEE TPDS. 50 | 51 | 4. The restore algorithm: 52 | > a) A Study of Replacement Algorithms for a Virtual-Storage Computer @ IBM Systems Journal'1966. 53 | > 54 | > b) Improving Restore Speed for Backup Systems that Use Inline Chunk-Based Deduplication @ FAST'13. 55 | > 56 | > c) Accelerating Restore and Garbage Collection in Deduplication-based Backup Systems via Exploiting Historical Information @ USENIX ATC'14. 57 | 58 | 5. Garbage collection: 59 | > a) Building a High-Performance Deduplication System @ USENIX ATC'11. 60 | > 61 | > b) Cumulus: Filesystem Backup to the Cloud @ FAST'09. 62 | > 63 | > c) Accelerating Restore and Garbage Collection in Deduplication-based Backup Systems via Exploiting Historical Information @ USENIX ATC'14. 64 | 65 | The Destor paper 66 | ---------------------- 67 | **[FAST'15] Design Tradeoffs for Data Deduplication Performance in Backup Workloads.** 68 | 69 | This paper presents the parameter space in data deduplication that guides the design of Destor. 70 | It then gives the overall architecture and the backup/restore pipeline in Destor. 71 | Finally, we did an entensive experimentation via Destor to find reasonable solutions. 72 | You can find the paper in doc directory. 
73 | 74 | Recent publications using Destor 75 | ----------------------------- 76 | 1. **Min Fu**, Dan Feng, Yu Hua, Xubin He, Zuoning Chen, Wen Xia, Fangting Huang, and Qing Liu. *Accelerating Restore and Garbage Collection in Deduplication-based Backup Systems via Exploiting Historical Information* @ USENIX ATC'14. 77 | 2. Jian Liu, Yunpeng Chai, Xiao Qin, and Yuan Xiao. *PLC-Cache: Endurable SSD Cache for Deduplication-based Primary Storage* @ MSST'14. 78 | 3. Yucheng Zhang et al. *AE: An Asymmetric Extremum Content Defined Chunking Algorithm for Fast and Bandwidth-Efficient Data Deduplication* @ IEEE Infocom'15. 79 | 4. **Min Fu** et al. *Reducing Fragmentation for In-line Deduplication Backup Storage via Exploiting Historical Information and Cache Knowledge* @ IEEE TPDS. 80 | 81 | 82 | Environment 83 | ----------- 84 | Linux 64bit. 85 | 86 | Dependencies 87 | ------------ 88 | 1. libssl-dev is required to calculate sha-1 digest; 89 | 2. GLib 2.32 or later version 90 | 91 | > libglib.so and glib.h may not be found when you first install it. 92 | 93 | > The header files (that originally locate in /usr/local/include/glib-2.0 and /usr/local/lib/glib-2.0/include) are required to be moved to a searchable path, such as "/usr/local/include". 94 | 95 | > Also a link named libglib.so should be created in "/usr/local/lib". 96 | 97 | 3. Makefile is automatically generated by GNU autoconf and automake. 98 | 99 | INSTALL 100 | ------- 101 | If all dependencies are installed, 102 | compiling destor is straightforward: 103 | 104 | >./configure 105 | > 106 | >make 107 | > 108 | >make install 109 | 110 | To uninstall destor, run 111 | 112 | >make uninstall 113 | 114 | Running 115 | ------- 116 | If compile and install are successful, the executable file, destor, should have been moved to /usr/local/bin by default. 117 | You can create a config file, destor.config, in where you run destor. 118 | A sample destor.config is in the project directory. 
119 | 120 | NOTE: run **rebuild** script before destor to prepare working directory and clear data. 121 | 122 | destor can run as follows: 123 | 124 | 1. start a backup task, 125 | > destor /path/to/data -p"a line as in config file" 126 | 127 | 2. start a restore task, 128 | > destor -r /path/to/restore -p"a line as in config file" 129 | 130 | 3. show the current statistics of system, 131 | > destor -s 132 | 133 | 4. show help 134 | > destor -h 135 | 136 | 5. make a trace 137 | > destor -t /path/to/data 138 | 139 | Configuration 140 | ------------- 141 | A sample configuration is shown in destor.conf 142 | 143 | To find what the parameters in destor.conf exactly mean and how to configure an existing solution (such as DDFS), please read the paper **Design Tradeoffs for Data Deduplication Performance in Backup Workloads** in doc/. 144 | The parameter space is based on the taxonomy proposed in the paper. 145 | (Note: The paper is somewhat difficult to follow. I am sorry about that, still working on improving the readability.) 146 | 147 | Bugs 148 | ---- 149 | 1. If the running destor is crashed artificially or unexpectedly, data consistency is not guaranted and you'd better run rebuild script. 150 | 151 | 2. Do NOT support concurrent backup/restore. 152 | 153 | 3. If working path in destor.config is modified, the rebuild script must be modified too. 154 | 155 | 4. CMA assumes the backups are deleted in FIFO order. 156 | > If you set a backup-retention-time, the expired backup is deleted automatically. 157 | 158 | Author 159 | ------ 160 | Min Fu 161 | 162 | Email : fumin at hust.edu.cn 163 | 164 | Blog : fumin.hustbackup.cn 165 | 166 | (Feel free to contact me if you have any questions about Destor. 167 | I would appreciate bug report.) 
168 | -------------------------------------------------------------------------------- /src/index/similarity_detection.c: -------------------------------------------------------------------------------- 1 | /* 2 | * similarity_detection.c 3 | * 4 | * Created on: Mar 25, 2014 5 | * Author: fumin 6 | */ 7 | #include "index_buffer.h" 8 | #include "kvstore.h" 9 | #include "fingerprint_cache.h" 10 | #include "../recipe/recipestore.h" 11 | #include "../storage/containerstore.h" 12 | #include "../jcr.h" 13 | 14 | extern struct index_overhead index_overhead; 15 | 16 | extern struct index_buffer index_buffer; 17 | 18 | /* 19 | * Larger one comes before smaller one. 20 | * Descending order. 21 | */ 22 | static gint g_segment_cmp_feature_num(struct segment* a, 23 | struct segment* b, gpointer user_data) { 24 | gint ret = g_hash_table_size(b->features) - g_hash_table_size(a->features); 25 | if (ret == 0) { 26 | ret = b->id > a->id ? 1 : -1; 27 | return ret; 28 | } else 29 | return ret; 30 | } 31 | 32 | /* 33 | * Remove the features that are common with top from target. 34 | */ 35 | static void features_trim(struct segment *target, 36 | struct segment *top) { 37 | GHashTableIter iter; 38 | gpointer key, value; 39 | g_hash_table_iter_init(&iter, top->features); 40 | while (g_hash_table_iter_next(&iter, &key, &value)) { 41 | g_hash_table_remove(target->features, key); 42 | } 43 | } 44 | 45 | /* 46 | * Select the top segments that are most similar with features. 47 | * (top-k * prefetching_num) cannot be larger than the segment cache size. 48 | */ 49 | static void top_segment_select(GHashTable* features) { 50 | /* 51 | * Mapping segment IDs to similar segments that hold at least one of features. 52 | */ 53 | GHashTable *similar_segments = g_hash_table_new_full(g_int64_hash, g_int64_equal, NULL, 54 | free_segment); 55 | 56 | GHashTableIter iter; 57 | gpointer key, value; 58 | g_hash_table_iter_init(&iter, features); 59 | /* Iterate the features of the segment. 
*/ 60 | while (g_hash_table_iter_next(&iter, &key, &value)) { 61 | /* Each feature is mapped to several segment IDs. */ 62 | segmentid *ids = kvstore_lookup((fingerprint*) key); 63 | if (ids) { 64 | index_overhead.lookup_requests++; 65 | int i; 66 | for (i = 0; i < destor.index_value_length; i++) { 67 | if (ids[i] == TEMPORARY_ID) 68 | break; 69 | struct segment* s = g_hash_table_lookup(similar_segments, &ids[i]); 70 | if (!s) { 71 | s = new_segment_full(); 72 | s->id = ids[i]; 73 | g_hash_table_insert(similar_segments, &s->id, s); 74 | } 75 | char* feature = malloc(destor.index_key_size); 76 | memcpy(feature, key, destor.index_key_size); 77 | assert(!g_hash_table_contains(s->features, feature)); 78 | g_hash_table_insert(s->features, feature, NULL); 79 | } 80 | }else{ 81 | index_overhead.lookup_requests_for_unique++; 82 | } 83 | } 84 | 85 | if (g_hash_table_size(similar_segments) != 0) { 86 | 87 | /* Sorting similar segments in order of their number of hit features. */ 88 | GSequence *seq = g_sequence_new(NULL); 89 | GHashTableIter iter; 90 | gpointer key, value; 91 | g_hash_table_iter_init(&iter, similar_segments); 92 | while (g_hash_table_iter_next(&iter, &key, &value)) { 93 | /* Insert similar segments into GSequence. */ 94 | struct segment* s = value; 95 | NOTICE("candidate segment %lld with %d shared features", s->id, 96 | g_hash_table_size(s->features)); 97 | g_sequence_insert_sorted(seq, s, g_segment_cmp_feature_num, NULL); 98 | } 99 | 100 | /* The number of selected similar segments */ 101 | int num = g_sequence_get_length(seq) 102 | > destor.index_segment_selection_method[1] ? 103 | destor.index_segment_selection_method[1] : 104 | g_sequence_get_length(seq), i; 105 | 106 | NOTICE("select Top-%d in %d segments", num, g_sequence_get_length(seq)); 107 | 108 | /* Prefetched top similar segments are pushed into the queue. 
*/
	for (i = 0; i < num; i++) {
		/* Get the top segment */
		struct segment *top = g_sequence_get(
				g_sequence_get_begin_iter(seq));
		NOTICE("read segment %lld", top->id);

		/* Load the selected segment's fingerprints into the cache. */
		fingerprint_cache_prefetch(top->id);

		g_sequence_remove(g_sequence_get_begin_iter(seq));
		/* Drop the selected segment's features from the remaining
		 * candidates, then re-rank candidates by feature count, so the
		 * next pick maximizes coverage of features not yet selected. */
		g_sequence_foreach(seq, features_trim, top);
		g_sequence_sort(seq, g_segment_cmp_feature_num, NULL);
	}
	g_sequence_free(seq);

	}

	g_hash_table_destroy(similar_segments);
}

/* NOTE(review): extern declaration only — presumably defined in the
 * filter/storage phase; confirm against the owning translation unit. */
extern struct{
	/* accessed in dedup phase */
	struct container *container_buffer;
	/* In order to facilitate sampling in container,
	 * we keep a queue for chunks in container buffer. */
	GSequence *chunks;
} storage_buffer;

/*
 * Deduplicate the chunks of segment 's' using similarity detection.
 *
 * For each data chunk the lookups are tried in a fixed order:
 *   1. the in-memory container currently being filled (storage_buffer);
 *   2. recently backed-up fingerprints buffered in index_buffer;
 *   3. the fingerprint cache (prefetched units);
 *   4. the key-value store (only for exact dedup or the MIX selection
 *      method), prefetching the hit unit into the cache.
 * Every chunk — duplicate or not — is then appended to the index buffer.
 */
void index_lookup_similarity_detection(struct segment *s){
	assert(s->features);
	top_segment_select(s->features);

	GSequenceIter *iter = g_sequence_get_begin_iter(s->chunks);
	GSequenceIter *end = g_sequence_get_end_iter(s->chunks);
	for (; iter != end; iter = g_sequence_iter_next(iter)) {
		struct chunk* c = g_sequence_get(iter);

		/* File boundary markers carry no data; skip them. */
		if (CHECK_CHUNK(c, CHUNK_FILE_START) || CHECK_CHUNK(c, CHUNK_FILE_END))
			continue;

		/* First check it in the storage buffer */
		if(storage_buffer.container_buffer
				&& lookup_fingerprint_in_container(storage_buffer.container_buffer, &c->fp)){
			c->id = get_container_id(storage_buffer.container_buffer);
			SET_CHUNK(c, CHUNK_DUPLICATE);
			/* Already in the open container: rewriting it is pointless. */
			SET_CHUNK(c, CHUNK_REWRITE_DENIED);
		}
		/*
		 * First check the buffered fingerprints,
		 * recently backup fingerprints.
		 */
		GQueue *tq = g_hash_table_lookup(index_buffer.buffered_fingerprints, &c->fp);
		if (!tq) {
			/* New fingerprint in the buffer: start its queue; it is
			 * published to the hash table at the bottom of the loop. */
			tq = g_queue_new();
		} else if (!CHECK_CHUNK(c, CHUNK_DUPLICATE)) {
			struct indexElem *be = g_queue_peek_head(tq);
			c->id = be->id;
			SET_CHUNK(c, CHUNK_DUPLICATE);
		}

		/* Check the fingerprint cache */
		if (!CHECK_CHUNK(c, CHUNK_DUPLICATE)) {
			/* Searching in fingerprint cache */
			int64_t id = fingerprint_cache_lookup(&c->fp);
			if(id != TEMPORARY_ID){
				c->id = id;
				SET_CHUNK(c, CHUNK_DUPLICATE);
			}
		}

		if(destor.index_category[0] == INDEX_CATEGORY_EXACT
				|| destor.index_segment_selection_method[0] == INDEX_SEGMENT_SELECT_MIX){
			if (!CHECK_CHUNK(c, CHUNK_DUPLICATE)) {
				/* Searching in key-value store */
				int64_t* ids = kvstore_lookup((char*)&c->fp);
				if(ids){
					index_overhead.lookup_requests++;
					/* prefetch the target unit */
					fingerprint_cache_prefetch(ids[0]);
					int64_t id = fingerprint_cache_lookup(&c->fp);
					if(id != TEMPORARY_ID){
						/*
						 * It can be not cached,
						 * since a partial key is possible in near-exact deduplication.
						 */
						c->id = id;
						SET_CHUNK(c, CHUNK_DUPLICATE);
					}else{
						NOTICE("Filter phase: A key collision occurs");
					}
				}else{
					index_overhead.lookup_requests_for_unique++;
					VERBOSE("Dedup phase: non-existing fingerprint");
				}
			}
		}

		/* Insert it into the index buffer */
		struct indexElem *ne = (struct indexElem*) malloc(
				sizeof(struct indexElem));
		ne->id = c->id;
		memcpy(&ne->fp, &c->fp, sizeof(fingerprint));

		g_queue_push_tail(tq, ne);
		/* Key is the fingerprint stored inside the newest element, so the
		 * key stays valid exactly as long as the queue holds the element. */
		g_hash_table_replace(index_buffer.buffered_fingerprints, &ne->fp, tq);

		index_buffer.chunk_num++;
	}

}
--------------------------------------------------------------------------------
/src/utils/bloom_filter.c:
--------------------------------------------------------------------------------
/*******************************************************************************
 *
 * bloom_filter.c -- A simple bloom filter implementation
 * by timdoug -- timdoug@gmail.com -- http://www.timdoug.com -- 2008-07-05
 * see http://en.wikipedia.org/wiki/Bloom_filter
 *
 ********************************************************************************
 *
 * Copyright (c) 2008, timdoug(@gmail.com) -- except for the hash functions
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
19 | * * The name of the author may not be used to endorse or promote products 20 | * derived from this software without specific prior written permission. 21 | * 22 | * THIS SOFTWARE IS PROVIDED BY timdoug ``AS IS'' AND ANY 23 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 24 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 25 | * DISCLAIMED. IN NO EVENT SHALL timdoug BE LIABLE FOR ANY 26 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 27 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 29 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 31 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | * 33 | ******************************************************************************** 34 | * 35 | * compile with (note -- ints should be >= 32 bits): 36 | * gcc -Wall -Werror -g -ansi -pedantic -O2 bloom_filter.c -o bloom_filter 37 | * 38 | * example usage ("words" is from /usr/share/dict/words on debian): 39 | * $ ./bloom_filter 40 | * usage: ./bloom_filter dictionary word ... 41 | * $ ./bloom_filter words test word words foo bar baz not_in_dict 42 | * "test" in dictionary 43 | * "word" in dictionary 44 | * "words" in dictionary 45 | * "foo" not in dictionary 46 | * "bar" in dictionary 47 | * "baz" not in dictionary 48 | * "not_in_dict" not in dictionary 49 | * 50 | * this implementation uses 7 hash functions, the optimal number for an input 51 | * of approx. 100,000 and a filter size of 2^20 bits -- a ~7.2x decrease in 52 | * space needed (from 912KB to 128KB), with an false positive rate of 0.007. 
53 | * consult Wikipedia for optimal numbers for your input and desired size 54 | * 55 | * all the hash functions are under the Common Public License and from: 56 | * http://www.partow.net/programming/hashfunctions/index.html 57 | * although I haven't rigorously tested them -- for real use, you'll probably 58 | * want to make sure they work well for your data set or choose others. 59 | * 60 | * other hash functions of interest: 61 | * http://www.cse.yorku.ca/~oz/hash.html 62 | * http://www.isthe.com/chongo/tech/comp/fnv/ 63 | * http://www.azillionmonkeys.com/qed/hash.html 64 | * http://murmurhash.googlepages.com/ 65 | * 66 | *******************************************************************************/ 67 | 68 | /*************************************************** 69 | * This file has been modified by Min Fu in 11/21/2012. 70 | * get_hashes() has been replaced with an array of function pointers, hash_func[]. 71 | * ************************************************/ 72 | 73 | #include 74 | #include 75 | #include 76 | #include "bloom_filter.h" 77 | 78 | 79 | /* hash functions */ 80 | unsigned int RSHash (unsigned char *, unsigned int); 81 | unsigned int DJBHash (unsigned char *, unsigned int); 82 | unsigned int FNVHash (unsigned char *, unsigned int); 83 | unsigned int JSHash (unsigned char *, unsigned int); 84 | unsigned int PJWHash (unsigned char *, unsigned int); 85 | unsigned int SDBMHash(unsigned char *, unsigned int); 86 | unsigned int DEKHash (unsigned char *, unsigned int); 87 | 88 | unsigned int (*hash_func[])(unsigned char *, unsigned int) = { 89 | RSHash, 90 | DJBHash, 91 | FNVHash, 92 | JSHash, 93 | PJWHash, 94 | SDBMHash, 95 | DEKHash 96 | }; 97 | 98 | void insert_word(unsigned char filter[], char *str, int len) 99 | { 100 | unsigned long hash[NUM_HASHES]; 101 | int i; 102 | 103 | for (i = 0; i < NUM_HASHES; i++) { 104 | hash[i] = hash_func[i](str, len); 105 | /* xor-fold the hash into FILTER_SIZE bits */ 106 | hash[i] = (hash[i] >> FILTER_SIZE) ^ 107 | 
(hash[i] & FILTER_BITMASK); 108 | /* set the bit in the filter */ 109 | filter[hash[i] >> 3] |= 1 << (hash[i] & 7); 110 | } 111 | } 112 | 113 | int in_dict(unsigned char filter[], char *str, int len) 114 | { 115 | unsigned int hash[NUM_HASHES]; 116 | int i; 117 | 118 | for (i = 0; i < NUM_HASHES; i++) { 119 | hash[i] = hash_func[i](str, len); 120 | hash[i] = (hash[i] >> FILTER_SIZE) ^ 121 | (hash[i] & FILTER_BITMASK); 122 | if (!(filter[hash[i] >> 3] & (1 << (hash[i] & 7)))) 123 | return 0; 124 | } 125 | 126 | return 1; 127 | } 128 | 129 | /****************\ 130 | | Hash Functions | 131 | \****************/ 132 | 133 | unsigned int RSHash(unsigned char *str, unsigned int len) 134 | { 135 | unsigned int b = 378551; 136 | unsigned int a = 63689; 137 | unsigned int hash = 0; 138 | unsigned int i = 0; 139 | 140 | for(i = 0; i < len; str++, i++) 141 | { 142 | hash = hash * a + (*str); 143 | a = a * b; 144 | } 145 | 146 | return hash; 147 | } 148 | 149 | unsigned int JSHash(unsigned char *str, unsigned int len) 150 | { 151 | unsigned int hash = 1315423911; 152 | unsigned int i = 0; 153 | 154 | for(i = 0; i < len; str++, i++) 155 | { 156 | hash ^= ((hash << 5) + (*str) + (hash >> 2)); 157 | } 158 | 159 | return hash; 160 | } 161 | 162 | unsigned int PJWHash(unsigned char *str, unsigned int len) 163 | { 164 | const unsigned int BitsInUnsignedInt = (unsigned int)(sizeof(unsigned int) * 8); 165 | const unsigned int ThreeQuarters = (unsigned int)((BitsInUnsignedInt * 3) / 4); 166 | const unsigned int OneEighth = (unsigned int)(BitsInUnsignedInt / 8); 167 | const unsigned int HighBits = (unsigned int)(0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth); 168 | unsigned int hash = 0; 169 | unsigned int test = 0; 170 | unsigned int i = 0; 171 | 172 | for(i = 0; i < len; str++, i++) 173 | { 174 | hash = (hash << OneEighth) + (*str); 175 | 176 | if((test = hash & HighBits) != 0) 177 | { 178 | hash = (( hash ^ (test >> ThreeQuarters)) & (~HighBits)); 179 | } 180 | } 181 | 182 | return 
hash; 183 | } 184 | 185 | unsigned int SDBMHash(unsigned char *str, unsigned int len) 186 | { 187 | unsigned int hash = 0; 188 | unsigned int i = 0; 189 | 190 | for(i = 0; i < len; str++, i++) 191 | { 192 | hash = (*str) + (hash << 6) + (hash << 16) - hash; 193 | } 194 | 195 | return hash; 196 | } 197 | 198 | unsigned int DJBHash(unsigned char *str, unsigned int len) 199 | { 200 | unsigned int hash = 5381; 201 | unsigned int i = 0; 202 | 203 | for(i = 0; i < len; str++, i++) 204 | { 205 | hash = ((hash << 5) + hash) + (*str); 206 | } 207 | 208 | return hash; 209 | } 210 | 211 | unsigned int DEKHash(unsigned char *str, unsigned int len) 212 | { 213 | unsigned int hash = len; 214 | unsigned int i = 0; 215 | 216 | for(i = 0; i < len; str++, i++) 217 | { 218 | hash = ((hash << 5) ^ (hash >> 27)) ^ (*str); 219 | } 220 | return hash; 221 | } 222 | 223 | unsigned int FNVHash(unsigned char *str, unsigned int len) 224 | { 225 | const unsigned int fnv_prime = 0x811C9DC5; 226 | unsigned int hash = 0; 227 | unsigned int i = 0; 228 | 229 | for(i = 0; i < len; str++, i++) 230 | { 231 | hash *= fnv_prime; 232 | hash ^= (*str); 233 | } 234 | 235 | return hash; 236 | } 237 | -------------------------------------------------------------------------------- /src/optimal_restore.c: -------------------------------------------------------------------------------- 1 | /* 2 | * optimal_restore.c 3 | * 4 | * Created on: Nov 27, 2013 5 | * Author: fumin 6 | */ 7 | #include "destor.h" 8 | #include "jcr.h" 9 | #include "recipe/recipestore.h" 10 | #include "storage/containerstore.h" 11 | #include "restore.h" 12 | #include "utils/lru_cache.h" 13 | 14 | /* Consisting of a sequence of access records with an identical ID */ 15 | struct accessRecords { 16 | containerid cid; 17 | /* A queue of sequence numbers */ 18 | GQueue *seqno_queue; 19 | }; 20 | 21 | static struct accessRecords* new_access_records(containerid id) { 22 | struct accessRecords* r = (struct accessRecords*) malloc( 23 | 
sizeof(struct accessRecords));
	r->cid = id;
	r->seqno_queue = g_queue_new();
	return r;
}

/* Destroy notifier for access_record_table values.
 * NOTE(review): the assert requires the queue to already be drained, so
 * g_queue_free_full only releases the empty queue itself here. */
static void free_access_records(struct accessRecords* r) {
	assert(g_queue_get_length(r->seqno_queue) == 0);
	g_queue_free_full(r->seqno_queue, free);
	free(r);
}

/*
 * Ascending order.
 * Records with an empty seqno queue (no future access in the window)
 * sort last / first respectively via the early returns.
 */
static gint g_access_records_cmp_by_first_seqno(struct accessRecords *a,
		struct accessRecords *b, gpointer data) {
	int *da = g_queue_peek_head(a->seqno_queue);
	if (da == NULL)
		return 1;

	int *db = g_queue_peek_head(b->seqno_queue);
	if (db == NULL)
		return -1;

	return *da - *db;
}

/* Global state of the Belady-style (OPT) restore cache. */
struct {

	/* the seqno of next read access record */
	int current_sequence_number;

	/* Index buffered access records by id. */
	GHashTable *access_record_table;
	int buffered_access_record_num;

	/* Access records of cached containers. */
	GSequence *sorted_records_of_cached_containers;

	/* Container queue. */
	struct lruCache *lru_queue;

} optimal_cache;

/*
 * Refill the look-ahead window from the recipe so future container
 * accesses are known; each upcoming access gets a sequence number
 * appended to its container's record.
 */
static void optimal_cache_window_fill() {
	int n = destor.restore_opt_window_size - optimal_cache.buffered_access_record_num, k;
	containerid *ids = read_next_n_records(jcr.bv, n, &k);
	if (ids) {
		optimal_cache.buffered_access_record_num += k;

		/* update distance_seq */
		int i;
		/* NOTE(review): 1-based indexing over ids — assumes
		 * read_next_n_records fills slots 1..k; confirm against
		 * recipestore. Also, ids does not appear to be freed here —
		 * possible leak; verify ownership with read_next_n_records. */
		for (i = 1; i <= k; i++) {

			struct accessRecords* r = g_hash_table_lookup(
					optimal_cache.access_record_table, &ids[i]);
			if (!r) {
				r = new_access_records(ids[i]);
				g_hash_table_insert(optimal_cache.access_record_table, &r->cid, r);
			}

			int *no = (int*) malloc(sizeof(int));
			*no = optimal_cache.current_sequence_number++;
			g_queue_push_tail(r->seqno_queue, no);

		}
	}
}

/*
 * Init the optimal cache.
 */
void init_optimal_cache() {
	optimal_cache.current_sequence_number = 0;

	optimal_cache.access_record_table = g_hash_table_new_full(g_int64_hash, g_int64_equal,
			NULL, free_access_records);
	optimal_cache.buffered_access_record_num = 0;

	optimal_cache.sorted_records_of_cached_containers = g_sequence_new(NULL);

	/* Cache full containers normally, container metadata when simulating. */
	if (destor.simulation_level == SIMULATION_NO)
		optimal_cache.lru_queue = new_lru_cache(destor.restore_cache[1],
				free_container, lookup_fingerprint_in_container);
	else
		optimal_cache.lru_queue = new_lru_cache(destor.restore_cache[1],
				free_container_meta, lookup_fingerprint_in_container_meta);

	optimal_cache_window_fill();
}

/*
 * Consume the current access record for container 'id' and slide the
 * look-ahead window forward (refilling once half-empty).
 */
static void optimal_cache_window_slides(containerid id) {

	if (optimal_cache.buffered_access_record_num * 2 <= destor.restore_opt_window_size)
		optimal_cache_window_fill();

	struct accessRecords *r = g_hash_table_lookup(optimal_cache.access_record_table, &id);
	assert(r);
	int *d = g_queue_pop_head(r->seqno_queue);
	free(d);
	optimal_cache.buffered_access_record_num--;

}

/*
 * Before looking up a fingerprint,
 * we call optimal_cache_hits to check whether
 * the target container is in the cache.
 */
static int optimal_cache_hits(containerid id) {
	/* NOTE(review): static last_id makes this function stateful and
	 * non-reentrant; the window only slides when the container changes. */
	static containerid last_id = TEMPORARY_ID;
	if (last_id != id) {
		optimal_cache_window_slides(id);
		last_id = id;
	}

	if (destor.simulation_level == SIMULATION_NO)
		return lru_cache_hits(optimal_cache.lru_queue, &id,
				container_check_id) == NULL ? 0 : 1;
	else
		return lru_cache_hits(optimal_cache.lru_queue, &id,
				container_meta_check_id) == NULL ? 0 : 1;
}

/* The function will not be called if the simulation level >= RESTORE.
 */
static struct chunk* optimal_cache_lookup(fingerprint *fp) {

	assert(destor.simulation_level == SIMULATION_NO);

	struct container* con = lru_cache_lookup(optimal_cache.lru_queue, fp);
	struct chunk* c = get_chunk_in_container(con, fp);
	assert(c);

	return c;
}

/* Record chosen for eviction; set by the find_kicked_* callbacks below. */
struct accessRecords* victim;

static int find_kicked_container(struct container* con, GHashTable *ht) {

	struct accessRecords* r = g_hash_table_lookup(ht, &con->meta.id);
	if(r){
		victim = r;
		return 1;
	}
	return 0;
}

static int find_kicked_container_meta(struct containerMeta* cm, GHashTable *ht) {

	struct accessRecords* r = g_hash_table_lookup(ht, &cm->id);
	if(r){
		victim = r;
		return 1;
	}
	return 0;
}

/*
 * Insert container 'id' into the cache, first evicting (Belady's rule)
 * the cached container whose next access is farthest in the future.
 * Up to 10 extra never-again-accessed candidates are offered to
 * lru_cache_kicks alongside the farthest one.
 */
static void optimal_cache_insert(containerid id) {

	if (lru_cache_is_full(optimal_cache.lru_queue)) {
		GHashTable* ht = g_hash_table_new(g_int64_hash, g_int64_equal);

		/*
		 * re-sort the sequence.
		 */
		g_sequence_sort(optimal_cache.sorted_records_of_cached_containers,
				g_access_records_cmp_by_first_seqno, NULL);

		/* The last record sorts farthest-in-the-future (or never again). */
		GSequenceIter *iter = g_sequence_iter_prev(
				g_sequence_get_end_iter(optimal_cache.sorted_records_of_cached_containers));
		struct accessRecords* r = g_sequence_get(iter);
		g_hash_table_insert(ht, &r->cid, r);

		int i = 0;
		while (i < 10 && iter != g_sequence_get_begin_iter(optimal_cache.sorted_records_of_cached_containers)) {
			iter = g_sequence_iter_prev(iter);
			r = g_sequence_get(iter);
			if (g_queue_get_length(r->seqno_queue) == 0)
				g_hash_table_insert(ht, &r->cid, r);
			else
				break;
			i++;
		}

		if (destor.simulation_level == SIMULATION_NO)
			lru_cache_kicks(optimal_cache.lru_queue, ht, find_kicked_container);
		else
			lru_cache_kicks(optimal_cache.lru_queue, ht,
					find_kicked_container_meta);

		g_hash_table_destroy(ht);

		/* Walk back from the tail to the record the cache actually evicted. */
		iter = g_sequence_iter_prev(
				g_sequence_get_end_iter(optimal_cache.sorted_records_of_cached_containers));
		r = g_sequence_get(iter);
		while (r->cid != victim->cid) {
			iter = g_sequence_iter_prev(iter);
			r = g_sequence_get(iter);
		}

		/* iter (r) points to the evicted record */
		if(g_queue_get_length(r->seqno_queue) == 0){
			/* the container will not be accessed in the future */
			g_hash_table_remove(optimal_cache.access_record_table, &r->cid);
		}

		g_sequence_remove(iter);
	}

	jcr.read_container_num++;
	if (destor.simulation_level == SIMULATION_NO) {
		struct container* con = retrieve_container_by_id(id);
		lru_cache_insert(optimal_cache.lru_queue, con, NULL, NULL);
	} else {
		struct containerMeta *cm = retrieve_container_meta_by_id(id);
		lru_cache_insert(optimal_cache.lru_queue, cm, NULL, NULL);
	}

	struct accessRecords* r =
			g_hash_table_lookup(optimal_cache.access_record_table, &id);
	assert(r);

	g_sequence_insert_sorted(optimal_cache.sorted_records_of_cached_containers, r,
			g_access_records_cmp_by_first_seqno, NULL);

}

/*
 * Restore worker using the OPT cache: pops recipe chunks, ensures the
 * owning container is cached, and pushes data chunks downstream.
 */
void* optimal_restore_thread(void *arg) {
	init_optimal_cache();

	struct chunk* c;
	while ((c = sync_queue_pop(restore_recipe_queue))) {

		/* File boundary markers pass straight through. */
		if (CHECK_CHUNK(c, CHUNK_FILE_START) || CHECK_CHUNK(c, CHUNK_FILE_END)) {
			sync_queue_push(restore_chunk_queue, c);
			continue;
		}

		TIMER_DECLARE(1);
		TIMER_BEGIN(1);

		if (!optimal_cache_hits(c->id)) {
			VERBOSE("Restore cache: container %lld is missed", c->id);
			optimal_cache_insert(c->id);
		}

		if (destor.simulation_level == SIMULATION_NO) {
			struct chunk* rc = optimal_cache_lookup(&c->fp);
			TIMER_END(1, jcr.read_chunk_time);
			sync_queue_push(restore_chunk_queue, rc);
		} else {
			TIMER_END(1, jcr.read_chunk_time);
		}

		jcr.data_size += c->size;
		jcr.chunk_num++;
		free_chunk(c);
	}

	sync_queue_term(restore_chunk_queue);
	return NULL;
}
--------------------------------------------------------------------------------
/src/do_restore.c:
--------------------------------------------------------------------------------
#include "destor.h"
#include "jcr.h"
#include "recipe/recipestore.h"
#include "storage/containerstore.h"
#include "utils/lru_cache.h"
#include "restore.h"

/*
 * Restore worker using a plain LRU cache of containers (or container
 * metadata when simulating).
 */
static void* lru_restore_thread(void *arg) {
	struct lruCache *cache;
	if (destor.simulation_level >= SIMULATION_RESTORE)
		cache = new_lru_cache(destor.restore_cache[1], free_container_meta,
				lookup_fingerprint_in_container_meta);
	else
		cache = new_lru_cache(destor.restore_cache[1], free_container,
				lookup_fingerprint_in_container);

	struct chunk* c;
	while ((c =
sync_queue_pop(restore_recipe_queue))) { 19 | 20 | if (CHECK_CHUNK(c, CHUNK_FILE_START) || CHECK_CHUNK(c, CHUNK_FILE_END)) { 21 | sync_queue_push(restore_chunk_queue, c); 22 | continue; 23 | } 24 | 25 | TIMER_DECLARE(1); 26 | TIMER_BEGIN(1); 27 | 28 | if (destor.simulation_level >= SIMULATION_RESTORE) { 29 | struct containerMeta *cm = lru_cache_lookup(cache, &c->fp); 30 | if (!cm) { 31 | VERBOSE("Restore cache: container %lld is missed", c->id); 32 | cm = retrieve_container_meta_by_id(c->id); 33 | assert(lookup_fingerprint_in_container_meta(cm, &c->fp)); 34 | lru_cache_insert(cache, cm, NULL, NULL); 35 | jcr.read_container_num++; 36 | } 37 | 38 | TIMER_END(1, jcr.read_chunk_time); 39 | } else { 40 | struct container *con = lru_cache_lookup(cache, &c->fp); 41 | if (!con) { 42 | VERBOSE("Restore cache: container %lld is missed", c->id); 43 | con = retrieve_container_by_id(c->id); 44 | lru_cache_insert(cache, con, NULL, NULL); 45 | jcr.read_container_num++; 46 | } 47 | struct chunk *rc = get_chunk_in_container(con, &c->fp); 48 | assert(rc); 49 | TIMER_END(1, jcr.read_chunk_time); 50 | sync_queue_push(restore_chunk_queue, rc); 51 | } 52 | 53 | jcr.data_size += c->size; 54 | jcr.chunk_num++; 55 | free_chunk(c); 56 | } 57 | 58 | sync_queue_term(restore_chunk_queue); 59 | 60 | free_lru_cache(cache); 61 | 62 | return NULL; 63 | } 64 | 65 | static void* read_recipe_thread(void *arg) { 66 | 67 | int i, j, k; 68 | for (i = 0; i < jcr.bv->number_of_files; i++) { 69 | TIMER_DECLARE(1); 70 | TIMER_BEGIN(1); 71 | 72 | struct fileRecipeMeta *r = read_next_file_recipe_meta(jcr.bv); 73 | 74 | struct chunk *c = new_chunk(sdslen(r->filename) + 1); 75 | strcpy(c->data, r->filename); 76 | SET_CHUNK(c, CHUNK_FILE_START); 77 | 78 | TIMER_END(1, jcr.read_recipe_time); 79 | 80 | sync_queue_push(restore_recipe_queue, c); 81 | 82 | for (j = 0; j < r->chunknum; j++) { 83 | TIMER_DECLARE(1); 84 | TIMER_BEGIN(1); 85 | 86 | struct chunkPointer* cp = read_next_n_chunk_pointers(jcr.bv, 1, &k); 87 | 
88 | struct chunk* c = new_chunk(0); 89 | memcpy(&c->fp, &cp->fp, sizeof(fingerprint)); 90 | c->size = cp->size; 91 | c->id = cp->id; 92 | 93 | TIMER_END(1, jcr.read_recipe_time); 94 | 95 | sync_queue_push(restore_recipe_queue, c); 96 | free(cp); 97 | } 98 | 99 | c = new_chunk(0); 100 | SET_CHUNK(c, CHUNK_FILE_END); 101 | sync_queue_push(restore_recipe_queue, c); 102 | 103 | free_file_recipe_meta(r); 104 | } 105 | 106 | sync_queue_term(restore_recipe_queue); 107 | return NULL; 108 | } 109 | 110 | void* write_restore_data(void* arg) { 111 | 112 | char *p, *q; 113 | q = jcr.path + 1;/* ignore the first char*/ 114 | /* 115 | * recursively make directory 116 | */ 117 | while ((p = strchr(q, '/'))) { 118 | if (*p == *(p - 1)) { 119 | q++; 120 | continue; 121 | } 122 | *p = 0; 123 | if (access(jcr.path, 0) != 0) { 124 | mkdir(jcr.path, S_IRWXU | S_IRWXG | S_IRWXO); 125 | } 126 | *p = '/'; 127 | q = p + 1; 128 | } 129 | 130 | struct chunk *c = NULL; 131 | FILE *fp = NULL; 132 | 133 | while ((c = sync_queue_pop(restore_chunk_queue))) { 134 | 135 | TIMER_DECLARE(1); 136 | TIMER_BEGIN(1); 137 | 138 | if (CHECK_CHUNK(c, CHUNK_FILE_START)) { 139 | VERBOSE("Restoring: %s", c->data); 140 | 141 | sds filepath = sdsdup(jcr.path); 142 | filepath = sdscat(filepath, c->data); 143 | 144 | int len = sdslen(jcr.path); 145 | char *q = filepath + len; 146 | char *p; 147 | while ((p = strchr(q, '/'))) { 148 | if (*p == *(p - 1)) { 149 | q++; 150 | continue; 151 | } 152 | *p = 0; 153 | if (access(filepath, 0) != 0) { 154 | mkdir(filepath, S_IRWXU | S_IRWXG | S_IRWXO); 155 | } 156 | *p = '/'; 157 | q = p + 1; 158 | } 159 | 160 | if (destor.simulation_level == SIMULATION_NO) { 161 | assert(fp == NULL); 162 | fp = fopen(filepath, "w"); 163 | } 164 | 165 | sdsfree(filepath); 166 | 167 | } else if (CHECK_CHUNK(c, CHUNK_FILE_END)) { 168 | jcr.file_num++; 169 | 170 | if (fp) 171 | fclose(fp); 172 | fp = NULL; 173 | } else { 174 | assert(destor.simulation_level == SIMULATION_NO); 175 | 
VERBOSE("Restoring %d bytes", c->size); 176 | fwrite(c->data, c->size, 1, fp); 177 | } 178 | 179 | free_chunk(c); 180 | 181 | TIMER_END(1, jcr.write_chunk_time); 182 | } 183 | 184 | jcr.status = JCR_STATUS_DONE; 185 | return NULL; 186 | } 187 | 188 | void do_restore(int revision, char *path) { 189 | 190 | init_recipe_store(); 191 | init_container_store(); 192 | 193 | init_restore_jcr(revision, path); 194 | 195 | destor_log(DESTOR_NOTICE, "job id: %d", jcr.id); 196 | destor_log(DESTOR_NOTICE, "backup path: %s", jcr.bv->path); 197 | destor_log(DESTOR_NOTICE, "restore to: %s", jcr.path); 198 | 199 | restore_chunk_queue = sync_queue_new(100); 200 | restore_recipe_queue = sync_queue_new(100); 201 | 202 | TIMER_DECLARE(1); 203 | TIMER_BEGIN(1); 204 | 205 | puts("==== restore begin ===="); 206 | 207 | jcr.status = JCR_STATUS_RUNNING; 208 | pthread_t recipe_t, read_t, write_t; 209 | pthread_create(&recipe_t, NULL, read_recipe_thread, NULL); 210 | 211 | if (destor.restore_cache[0] == RESTORE_CACHE_LRU) { 212 | destor_log(DESTOR_NOTICE, "restore cache is LRU"); 213 | pthread_create(&read_t, NULL, lru_restore_thread, NULL); 214 | } else if (destor.restore_cache[0] == RESTORE_CACHE_OPT) { 215 | destor_log(DESTOR_NOTICE, "restore cache is OPT"); 216 | pthread_create(&read_t, NULL, optimal_restore_thread, NULL); 217 | } else if (destor.restore_cache[0] == RESTORE_CACHE_ASM) { 218 | destor_log(DESTOR_NOTICE, "restore cache is ASM"); 219 | pthread_create(&read_t, NULL, assembly_restore_thread, NULL); 220 | } else { 221 | fprintf(stderr, "Invalid restore cache.\n"); 222 | exit(1); 223 | } 224 | 225 | pthread_create(&write_t, NULL, write_restore_data, NULL); 226 | 227 | do{ 228 | sleep(5); 229 | /*time_t now = time(NULL);*/ 230 | fprintf(stderr, "%" PRId64 " bytes, %" PRId32 " chunks, %d files processed\r", 231 | jcr.data_size, jcr.chunk_num, jcr.file_num); 232 | }while(jcr.status == JCR_STATUS_RUNNING || jcr.status != JCR_STATUS_DONE); 233 | fprintf(stderr, "%" PRId64 " bytes, %" 
PRId32 " chunks, %d files processed\n", 234 | jcr.data_size, jcr.chunk_num, jcr.file_num); 235 | 236 | assert(sync_queue_size(restore_chunk_queue) == 0); 237 | assert(sync_queue_size(restore_recipe_queue) == 0); 238 | 239 | free_backup_version(jcr.bv); 240 | 241 | TIMER_END(1, jcr.total_time); 242 | puts("==== restore end ===="); 243 | 244 | printf("job id: %" PRId32 "\n", jcr.id); 245 | printf("restore path: %s\n", jcr.path); 246 | printf("number of files: %" PRId32 "\n", jcr.file_num); 247 | printf("number of chunks: %" PRId32"\n", jcr.chunk_num); 248 | printf("total size(B): %" PRId64 "\n", jcr.data_size); 249 | printf("total time(s): %.3f\n", jcr.total_time / 1000000); 250 | printf("throughput(MB/s): %.2f\n", 251 | jcr.data_size * 1000000 / (1024.0 * 1024 * jcr.total_time)); 252 | printf("speed factor: %.2f\n", 253 | jcr.data_size / (1024.0 * 1024 * jcr.read_container_num)); 254 | 255 | printf("read_recipe_time : %.3fs, %.2fMB/s\n", 256 | jcr.read_recipe_time / 1000000, 257 | jcr.data_size * 1000000 / jcr.read_recipe_time / 1024 / 1024); 258 | printf("read_chunk_time : %.3fs, %.2fMB/s\n", jcr.read_chunk_time / 1000000, 259 | jcr.data_size * 1000000 / jcr.read_chunk_time / 1024 / 1024); 260 | printf("write_chunk_time : %.3fs, %.2fMB/s\n", 261 | jcr.write_chunk_time / 1000000, 262 | jcr.data_size * 1000000 / jcr.write_chunk_time / 1024 / 1024); 263 | 264 | char logfile[] = "restore.log"; 265 | FILE *fp = fopen(logfile, "a"); 266 | 267 | /* 268 | * job id, 269 | * chunk num, 270 | * data size, 271 | * actually read container number, 272 | * speed factor, 273 | * throughput 274 | */ 275 | fprintf(fp, "%" PRId32 " %" PRId64 " %" PRId32 " %.4f %.4f\n", jcr.id, jcr.data_size, 276 | jcr.read_container_num, 277 | jcr.data_size / (1024.0 * 1024 * jcr.read_container_num), 278 | jcr.data_size * 1000000 / (1024 * 1024 * jcr.total_time)); 279 | 280 | fclose(fp); 281 | 282 | close_container_store(); 283 | close_recipe_store(); 284 | } 285 | 286 | 
--------------------------------------------------------------------------------