├── LICENSE ├── Makefile ├── README.md ├── fastcdc.c ├── fastcdc.h └── test.c /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Joseph Calderon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
/*
 * fastcdc - small FastCDC (content-defined chunking) implementation in C99.
 *
 * Declarations (fastcdc.h) followed by the implementation (fastcdc.c).
 */
#ifndef FASTCDC_H
#define FASTCDC_H

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Chunker state. Create it with fastcdc_init(); do not fill it in by hand. */
typedef struct {
  uint32_t mi;     /* minimum chunk size, after clamping            */
  uint32_t av;     /* average chunk size, after clamping            */
  uint32_t ma;     /* maximum chunk size, after clamping            */
  uint32_t ns;     /* offset at which cut() switches to mask_l      */
  uint32_t mask_s; /* mask applied to the hash before offset ns     */
  uint32_t mask_l; /* mask applied to the hash from ns up to ma     */
  size_t pos;      /* bytes consumed by all fastcdc_update() calls  */
} fcdc_ctx;

#define MAP_VEC_LEN 3

/*
 * Chunk callback: receives the caller's `arg`, the chunk's offset counted
 * from the first byte ever fed to the context, and the chunk length.
 * The return value is ignored by this library.
 */
typedef int (*on_chunk)(void *arg, size_t offset, size_t len);

typedef struct {
  size_t chunk_id;
  size_t chunk_len;
  uint8_t front[MAP_VEC_LEN];
  uint8_t end[MAP_VEC_LEN];
} map_entry;

fcdc_ctx fastcdc_init(uint32_t mi, uint32_t av, uint32_t ma);
size_t fastcdc_update(fcdc_ctx *ctx, uint8_t *data, size_t len, int end,
                      on_chunk cb, void *arg);
size_t fastcdc_stream(FILE *stream, uint32_t mi, uint32_t av, uint32_t ma,
                      on_chunk cb, void *arg);
#endif

#include <string.h>

/* Accepted ranges for the three size parameters of fastcdc_init(). */
#define AVERAGE_MIN (UINT32_C(1) << 8)
#define AVERAGE_MAX (UINT32_C(1) << 28)
#define MINIMUM_MIN (AVERAGE_MIN >> 2)
#define MINIMUM_MAX (AVERAGE_MAX >> 2)
#define MAXIMUM_MIN (AVERAGE_MIN << 2)
#define MAXIMUM_MAX (AVERAGE_MAX << 2)

/* Gear table: one 32-bit value per possible input byte. */
static const uint32_t GEAR[256] = {
    0x5C95C078, 0x22408989, 0x2D48A214, 0x12842087, 0x530F8AFB, 0x474536B9,
    0x2963B4F1, 0x44CB738B, 0x4EA7403D, 0x4D606B6E, 0x074EC5D3, 0x3AF39D18,
    0x726003CA, 0x37A62A74, 0x51A2F58E, 0x7506358E, 0x5D4AB128, 0x4D4AE17B,
    0x41E85924, 0x470C36F7, 0x4741CBE1, 0x01BB7F30, 0x617C1DE3, 0x2B0C3A1F,
    0x50C48F73, 0x21A82D37, 0x6095ACE0, 0x419167A0, 0x3CAF49B0, 0x40CEA62D,
    0x66BC1C66, 0x545E1DAD, 0x2BFA77CD, 0x6E85DA24, 0x5FB0BDC5, 0x652CFC29,
    0x3A0AE1AB, 0x2837E0F3, 0x6387B70E, 0x13176012, 0x4362C2BB, 0x66D8F4B1,
    0x37FCE834, 0x2C9CD386, 0x21144296, 0x627268A8, 0x650DF537, 0x2805D579,
    0x3B21EBBD, 0x7357ED34, 0x3F58B583, 0x7150DDCA, 0x7362225E, 0x620A6070,
    0x2C5EF529, 0x7B522466, 0x768B78C0, 0x4B54E51E, 0x75FA07E5, 0x06A35FC6,
    0x30B71024, 0x1C8626E1, 0x296AD578, 0x28D7BE2E, 0x1490A05A, 0x7CEE43BD,
    0x698B56E3, 0x09DC0126, 0x4ED6DF6E, 0x02C1BFC7, 0x2A59AD53, 0x29C0E434,
    0x7D6C5278, 0x507940A7, 0x5EF6BA93, 0x68B6AF1E, 0x46537276, 0x611BC766,
    0x155C587D, 0x301BA847, 0x2CC9DDA7, 0x0A438E2C, 0x0A69D514, 0x744C72D3,
    0x4F326B9B, 0x7EF34286, 0x4A0EF8A7, 0x6AE06EBE, 0x669C5372, 0x12402DCB,
    0x5FEAE99D, 0x76C7F4A7, 0x6ABDB79C, 0x0DFAA038, 0x20E2282C, 0x730ED48B,
    0x069DAC2F, 0x168ECF3E, 0x2610E61F, 0x2C512C8E, 0x15FB8C06, 0x5E62BC76,
    0x69555135, 0x0ADB864C, 0x4268F914, 0x349AB3AA, 0x20EDFDB2, 0x51727981,
    0x37B4B3D8, 0x5DD17522, 0x6B2CBFE4, 0x5C47CF9F, 0x30FA1CCD, 0x23DEDB56,
    0x13D1F50A, 0x64EDDEE7, 0x0820B0F7, 0x46E07308, 0x1E2D1DFD, 0x17B06C32,
    0x250036D8, 0x284DBF34, 0x68292EE0, 0x362EC87C, 0x087CB1EB, 0x76B46720,
    0x104130DB, 0x71966387, 0x482DC43F, 0x2388EF25, 0x524144E1, 0x44BD834E,
    0x448E7DA3, 0x3FA6EAF9, 0x3CDA215C, 0x3A500CF3, 0x395CB432, 0x5195129F,
    0x43945F87, 0x51862CA4, 0x56EA8FF1, 0x201034DC, 0x4D328FF5, 0x7D73A909,
    0x6234D379, 0x64CFBF9C, 0x36F6589A, 0x0A2CE98A, 0x5FE4D971, 0x03BC15C5,
    0x44021D33, 0x16C1932B, 0x37503614, 0x1ACAF69D, 0x3F03B779, 0x49E61A03,
    0x1F52D7EA, 0x1C6DDD5C, 0x062218CE, 0x07E7A11A, 0x1905757A, 0x7CE00A53,
    0x49F44F29, 0x4BCC70B5, 0x39FEEA55, 0x5242CEE8, 0x3CE56B85, 0x00B81672,
    0x46BEECCC, 0x3CA0AD56, 0x2396CEE8, 0x78547F40, 0x6B08089B, 0x66A56751,
    0x781E7E46, 0x1E2CF856, 0x3BC13591, 0x494A4202, 0x520494D7, 0x2D87459A,
    0x757555B6, 0x42284CC1, 0x1F478507, 0x75C95DFF, 0x35FF8DD7, 0x4E4757ED,
    0x2E11F88C, 0x5E1B5048, 0x420E6699, 0x226B0695, 0x4D1679B4, 0x5A22646F,
    0x161D1131, 0x125C68D9, 0x1313E32E, 0x4AA85724, 0x21DC7EC1, 0x4FFA29FE,
    0x72968382, 0x1CA8EEF3, 0x3F3B1C28, 0x39C2FB6C, 0x6D76493F, 0x7A22A62E,
    0x789B1C2A, 0x16E0CB53, 0x7DECEEEB, 0x0DC7E1C6, 0x5C75BF3D, 0x52218333,
    0x106DE4D6, 0x7DC64422, 0x65590FF4, 0x2C02EC30, 0x64A9AC67, 0x59CAB2E9,
    0x4A21D2F3, 0x0F616E57, 0x23B54EE8, 0x02730AAA, 0x2F3C634D, 0x7117FC6C,
    0x01AC6F05, 0x5A9ED20C, 0x158C4E2A, 0x42B699F0, 0x0C7C14B3, 0x02BD9641,
    0x15AD56FC, 0x1C722F60, 0x7DA1AF91, 0x23E0DBCB, 0x0E93E12B, 0x64B2791D,
    0x440D2476, 0x588EA8DD, 0x4665A658, 0x7446C418, 0x1877A774, 0x5626407E,
    0x7F63BD46, 0x32D2DBD8, 0x3C790F4A, 0x772B7239, 0x6F8B2826, 0x677FF609,
    0x0DC82C11, 0x23FFE354, 0x2EAC53A6, 0x16139E09, 0x0AFD0DBC, 0x2A4D4237,
    0x56A368C7, 0x234325E4, 0x2DCE9187, 0x32E8EA7E};

/* Restrict x to the closed range [lo, hi]. */
static uint32_t clamp_u32(uint32_t x, uint32_t lo, uint32_t hi) {
  if (x < lo) {
    return lo;
  }
  return (x > hi) ? hi : x;
}

/*
 * log2(v) rounded to the nearest integer, computed without floating point.
 * Returns 0 for v == 0.
 */
static uint32_t round_log2(uint32_t v) {
  uint32_t k = 0;
  for (uint32_t t = v; t > 1; t >>= 1) {
    k++; /* k ends up as floor(log2(v)) */
  }
  /*
   * Round up when v >= 2^(k + 1/2), i.e. when v^2 >= 2^(2k + 1). Equality is
   * impossible for an integer v, so there is no tie to break.
   */
  if ((uint64_t)v * v >= (UINT64_C(1) << (2 * k + 1))) {
    k++;
  }
  return k;
}

/* Mask with the low `bits` bits set; `bits` is first clamped to [1, 31]. */
static uint32_t mask_bits(uint32_t bits) {
  return (UINT32_C(1) << clamp_u32(bits, 1, 31)) - 1;
}

/*
 * Offset at which cut() switches from mask_s to mask_l:
 * av - min(av, mi + ceil(mi / 2)), limited to len.
 */
static uint32_t normal_size(const uint32_t mi, const uint32_t av,
                            const uint32_t len) {
  uint32_t off = mi + (mi / 2 + (mi % 2 ? 1 : 0));
  if (off > av) {
    off = av;
  }
  const uint32_t sz = av - off;
  return (sz > len) ? len : sz;
}

/*
 * Find the end of the chunk that starts at src[0].
 *
 * The first min(len, mi) bytes are skipped without hashing. Bytes up to
 * min(ns, len) are then tested against mask_s, and bytes up to min(ma, len)
 * against mask_l. Returns the chunk length, which never exceeds len.
 */
static uint32_t cut(const uint8_t *src, const uint32_t len, const uint32_t mi,
                    const uint32_t ma, const uint32_t ns,
                    const uint32_t mask_s, const uint32_t mask_l) {
  uint32_t fp = 0;
  uint32_t i = (len < mi) ? len : mi;

  uint32_t n = (ns < len) ? ns : len;
  for (; i < n; i++) {
    fp = (fp >> 1) + GEAR[src[i]];
    if ((fp & mask_s) == 0) {
      return i + 1;
    }
  }

  n = (ma < len) ? ma : len;
  for (; i < n; i++) {
    fp = (fp >> 1) + GEAR[src[i]];
    if ((fp & mask_l) == 0) {
      return i + 1;
    }
  }
  return i;
}

/*
 * Build a chunker context.
 *
 * mi, av and ma are the requested minimum, average and maximum chunk sizes.
 * Each is clamped to its supported range (MINIMUM_*, AVERAGE_*, MAXIMUM_*),
 * and every derived field (ns, mask_s, mask_l) is computed from the clamped
 * values, so the context is internally consistent even for out-of-range
 * arguments such as 0.
 */
fcdc_ctx fastcdc_init(uint32_t mi, uint32_t av, uint32_t ma) {
  mi = clamp_u32(mi, MINIMUM_MIN, MINIMUM_MAX);
  av = clamp_u32(av, AVERAGE_MIN, AVERAGE_MAX);
  ma = clamp_u32(ma, MAXIMUM_MIN, MAXIMUM_MAX);

  const uint32_t bits = round_log2(av); /* av >= 256, so bits >= 8 */
  fcdc_ctx ctx = {.mi = mi,
                  .av = av,
                  .ma = ma,
                  .ns = normal_size(mi, av, ma),
                  .mask_s = mask_bits(bits + 1),
                  .mask_l = mask_bits(bits - 1),
                  .pos = 0};
  return ctx;
}

/*
 * Chunk the buffer data[0..len).
 *
 * While at least ctx->ma bytes remain, or while any bytes remain and `end`
 * is non-zero, one chunk is cut and reported through cb(arg, offset, length).
 * cb may be NULL, in which case chunks are only counted.
 *
 * Returns the number of bytes consumed and adds it to ctx->pos. When `end`
 * is zero, fewer than ctx->ma trailing bytes may be left unconsumed; the
 * caller is expected to present them again together with further input.
 */
size_t fastcdc_update(fcdc_ctx *ctx, uint8_t *data, size_t len, int end,
                      on_chunk cb, void *arg) {
  size_t offset = 0;
  while (((len - offset) >= ctx->ma) || (end && (offset < len))) {
    const size_t remaining = len - offset;
    /* cut() takes a 32-bit length: saturate instead of truncating. */
    const uint32_t window =
        (remaining > UINT32_MAX) ? UINT32_MAX : (uint32_t)remaining;
    const uint32_t cp = cut(data + offset, window, ctx->mi, ctx->ma, ctx->ns,
                            ctx->mask_s, ctx->mask_l);
    if (cp == 0) {
      break; /* no progress (e.g. zero-initialised ctx): never spin */
    }
    if (cb != NULL) {
      cb(arg, ctx->pos + offset, cp);
    }
    offset += cp;
  }
  ctx->pos += offset;
  return offset;
}

/*
 * Chunk everything that can be read from `stream`, starting at its current
 * position, reporting each chunk through cb (see fastcdc_update()).
 *
 * Input is read sequentially into a buffer of four maximum-size chunks; the
 * unconsumed tail of each pass is moved to the front of the buffer and
 * topped up, so the stream does not have to be seekable.
 *
 * Returns the number of bytes chunked. Returns 0 if stream is NULL or the
 * buffer cannot be allocated. On a read error, chunking stops without
 * flushing the buffered tail; the caller can detect this with
 * ferror(stream).
 */
size_t fastcdc_stream(FILE *stream, uint32_t mi, uint32_t av, uint32_t ma,
                      on_chunk cb, void *arg) {
  if (stream == NULL) {
    return 0;
  }

  fcdc_ctx ctx = fastcdc_init(mi, av, ma);

  /* 4 * ma computed in size_t so it cannot wrap to 0 for ma == 1 << 30. */
  size_t cap = ctx.ma;
  cap = (cap > SIZE_MAX / 4) ? SIZE_MAX : cap * 4;
  if (cap > UINT32_MAX) {
    cap = UINT32_MAX;
  }

  uint8_t *buf = malloc(cap);
  if (buf == NULL) {
    return 0;
  }

  size_t total = 0;
  size_t have = 0; /* valid bytes currently held in buf */
  for (;;) {
    have += fread(buf + have, 1, cap - have, stream);
    const int eof = feof(stream);
    const int err = ferror(stream);

    const size_t used = fastcdc_update(&ctx, buf, have, eof, cb, arg);
    total += used;
    have -= used;

    if (eof || err || (used == 0 && have == cap)) {
      break;
    }
    memmove(buf, buf + used, have); /* keep the partial chunk for next pass */
  }

  free(buf);
  return total;
}