├── LICENSE ├── Makefile ├── README ├── simple9.c ├── simple9.h └── test-simple9.c /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2012 Christopher Hoobin. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | 1. Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above 11 | copyright notice, this list of conditions and the following 12 | disclaimer in the documentation and/or other materials provided 13 | with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY CHRISTOPHER HOOBIN ''AS IS'' AND ANY 16 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CHRISTOPHER HOOBIN OR 19 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 24 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | The views and conclusions contained in the software and documentation 28 | are those of the authors and should not be interpreted as representing 29 | official policies, either expressed or implied, of Christopher Hoobin. 
30 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | 3 | CFLAGS = -Wall -Wextra -Werror -ansi -pedantic -O2 4 | 5 | CPPFLAGS = -DNDEBUG 6 | 7 | DEBUG = #-ggdb #-pg 8 | 9 | all: test-simple9 10 | 11 | test-simple9: test-simple9.o libsimple9.a 12 | $(CC) $(DEBUG) -o $@ $^ 13 | 14 | libsimple9.a: simple9.o 15 | $(AR) cr $@ $^ 16 | 17 | %.o: %.c 18 | $(CC) $(DEBUG) $(CFLAGS) $(CPPFLAGS) -c $< 19 | 20 | clean: 21 | -$(RM) -f *.o test-simple9 libsimple9.a >/dev/null 2>&1 22 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | simple9, a word-aligned integer compression algorithm as described in 2 | 3 | Vo Ngoc Anh and Alistair Moffat. Inverted index compression using 4 | word-aligned binary codes. Information Retrieval, 8(1):151–166, 2005. 5 | -------------------------------------------------------------------------------- /simple9.c: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Christopher Hoobin. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | 1. Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 
15 | 16 | THIS SOFTWARE IS PROVIDED BY CHRISTOPHER HOOBIN ''AS IS'' AND ANY 17 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CHRISTOPHER HOOBIN OR 20 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 24 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | The views and conclusions contained in the software and documentation 29 | are those of the authors and should not be interpreted as representing 30 | official policies, either expressed or implied, of Christopher Hoobin. 
/*
 * simple9.c -- Simple-9 word-aligned integer coding.
 *
 * Stream layout written by simple9_encode() and read by the decoders:
 *   1. the element count n as a base-128 varint ("vbyte"), least
 *      significant 7-bit group first, high bit set on every byte but
 *      the last;
 *   2. a sequence of 32-bit words written with fwrite() in host byte
 *      order.  The low SELECTOR_BITS bits of a word hold a selector
 *      (an index into selectors[]); the remaining CODE_BITS bits hold
 *      the packed values, first value in the lowest bits.
 */

/* NOTE(review): the header names were lost in this copy of the file (it
 * showed seven bare "#include" lines).  The five below are the ones the
 * code in this file actually needs; confirm against the upstream source
 * whether two further headers were listed. */
#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define UINT32_BITS (sizeof(uint32_t) * CHAR_BIT)

#define SELECTOR_MASK 0x0000000F

#define SELECTOR_BITS 4

#define CODE_BITS (UINT32_BITS - SELECTOR_BITS)

/* Largest value a single element may have (28 payload bits). */
#define MAX_VALUE ((1UL << CODE_BITS) - 1)

#define NSELECTORS 9

/* One row per selector: how many elements a word holds, how many bits
 * each element gets, and how many payload bits are left unused. */
static const struct {
    uint32_t nitems;
    uint32_t nbits;
    uint32_t nwaste;
} selectors[NSELECTORS] = {
    {28,  1, 0},
    {14,  2, 0},
    { 9,  3, 1},
    { 7,  4, 0},
    { 5,  5, 3},
    { 4,  7, 0},
    { 3,  9, 1},
    { 2, 14, 0},
    { 1, 28, 0},
};

/* Writes value to fp as a base-128 varint.
 *
 * Returns the number of bytes written, or 0 if fwrite() failed. */
static size_t vbyte_encode(size_t value, FILE *fp)
{
    size_t nbytes = 0;
    uint8_t byte;

    do {
        byte = (uint8_t)(value & 0x7F);
        value >>= 7;

        if (value != 0)
            byte |= 0x80; /* more groups follow */

        if (fwrite(&byte, sizeof byte, 1, fp) != 1)
            return 0;

        nbytes++;
    } while (value != 0);

    return nbytes;
}

/* Reads a base-128 varint from fp into *value.
 *
 * Returns the number of bytes consumed, or 0 (with *value == 0) if the
 * stream ended early or the encoded number does not fit in a size_t. */
static size_t vbyte_decode(size_t *value, FILE *fp)
{
    size_t nbytes = 0;
    size_t result = 0;
    size_t shift = 0;
    size_t group;
    uint8_t byte;

    *value = 0;

    for (;;) {
        if (fread(&byte, sizeof byte, 1, fp) != 1)
            return 0;

        nbytes++;

        /* Shifting by the full width of the type is undefined. */
        if (shift >= sizeof result * CHAR_BIT)
            return 0;

        /* Widen before shifting: the promoted int would overflow as
         * soon as the count needs more than 31 bits. */
        group = (size_t)(byte & 0x7F);

        if (((group << shift) >> shift) != group)
            return 0; /* bits would be shifted out of size_t */

        result |= group << shift;
        shift += 7;

        if (byte < 0x80)
            break;
    }

    *value = result;

    return nbytes;
}

/* Packs a prefix of items[0..remaining-1] into one word.
 *
 * Selectors are tried from the densest to the sparsest.  A selector is
 * accepted when it fills the word completely or when it swallows every
 * remaining element.  The caller guarantees remaining > 0 and that every
 * element is <= MAX_VALUE, so the last selector always succeeds.
 *
 * Stores the number of elements consumed in *taken and returns the word. */
static uint32_t pack_word(const uint32_t *items, size_t remaining,
                          size_t *taken)
{
    uint32_t selector;
    uint32_t word = 0;
    uint32_t limit;
    uint32_t shift;
    size_t count = 0;

    for (selector = 0; selector < NSELECTORS; selector++) {
        limit = (UINT32_C(1) << selectors[selector].nbits) - 1;
        word = selector;
        shift = SELECTOR_BITS;
        count = 0;

        while (count < selectors[selector].nitems
               && count < remaining
               && items[count] <= limit) {
            word |= items[count] << shift;
            shift += selectors[selector].nbits;
            count++;
        }

        if (count == selectors[selector].nitems || count == remaining)
            break;
    }

    *taken = count;

    return word;
}

/* Allocates room for n + slack elements (at least one, so the result can
 * always be passed to free()).  Returns NULL if the size computation
 * would overflow or malloc() fails. */
static uint32_t *alloc_output(size_t n, size_t slack)
{
    size_t total;

    if (n > (size_t)-1 - slack)
        return NULL;

    total = n + slack;

    if (total == 0)
        total = 1;

    if (total > (size_t)-1 / sizeof(uint32_t))
        return NULL;

    return malloc(total * sizeof(uint32_t));
}

/* Common failure exit for the decoders: releases the output buffer and
 * leaves *array == NULL, *n == 0.  Always returns 0. */
static size_t decode_fail(uint32_t **array, size_t *n)
{
    free(*array);
    *array = NULL;
    *n = 0;

    return 0;
}

/* Encodes array[0..n-1] to fp.
 *
 * Every element must be <= MAX_VALUE (28 bits).  If any element is
 * larger, nothing is written and 0 is returned; previously this case
 * looped forever in builds compiled with NDEBUG.
 *
 * Returns the number of bytes written, or 0 on invalid input or write
 * failure.  A successful call always writes at least one byte. */
size_t simple9_encode(uint32_t *array, size_t n, FILE *fp)
{
    size_t nbytes;
    size_t index;
    size_t taken;
    uint32_t word;

    assert(fp);
    assert(array || n == 0);

    /* Validate first so a bad element cannot leave a partial stream. */
    for (index = 0; index < n; index++) {
        if (array[index] > MAX_VALUE)
            return 0;
    }

    nbytes = vbyte_encode(n, fp);

    if (nbytes == 0)
        return 0;

    index = 0;

    while (index < n) {
        word = pack_word(array + index, n - index, &taken);

        if (fwrite(&word, sizeof word, 1, fp) != 1)
            return 0;

        nbytes += sizeof word;
        index += taken;
    }

    return nbytes;
}

/* Decodes one simple9 stream from fp.
 *
 * On success *array points to a malloc()ed buffer holding *n elements
 * (the caller frees it) and the number of bytes consumed is returned.
 *
 * On failure (truncated stream, selector outside 0..NSELECTORS-1,
 * element count that cannot be allocated) nothing is leaked, *array is
 * NULL, *n is 0 and 0 is returned. */
size_t simple9_decode(uint32_t **array, size_t *n, FILE *fp)
{
    uint32_t word;
    uint32_t selector;
    uint32_t mask;
    uint32_t nbits;
    uint32_t capacity;
    uint32_t slot;

    size_t nbytes;
    size_t nitems;

    assert(array);
    assert(n);
    assert(fp);

    *array = NULL;

    nbytes = vbyte_decode(n, fp);

    if (nbytes == 0)
        return decode_fail(array, n);

    *array = alloc_output(*n, 0);

    if (*array == NULL)
        return decode_fail(array, n);

    nitems = 0;

    while (nitems < *n) {
        if (fread(&word, sizeof word, 1, fp) != 1)
            return decode_fail(array, n);

        nbytes += sizeof word;

        selector = word & SELECTOR_MASK;

        /* Four bits can name 16 selectors but only NSELECTORS exist. */
        if (selector >= NSELECTORS)
            return decode_fail(array, n);

        word >>= SELECTOR_BITS;

        nbits = selectors[selector].nbits;
        capacity = selectors[selector].nitems;
        mask = (UINT32_C(1) << nbits) - 1;

        /* The final word may be only partly filled; stop at n. */
        for (slot = 0; slot < capacity && nitems < *n; slot++) {
            (*array)[nitems++] = word & mask;
            word >>= nbits;
        }
    }

    return nbytes;
}

/* Same contract as simple9_decode(), but each word is unpacked by
 * straight-line code selected with a switch instead of an inner loop.
 *
 * The buffer returned in *array is larger than *n elements; only the
 * first *n are meaningful. */
size_t simple9_decode_unrolled(uint32_t **array, size_t *n, FILE *fp)
{
    uint32_t word;
    uint32_t selector;
    uint32_t *out;

    size_t nbytes;
    size_t nitems;

    assert(array);
    assert(n);
    assert(fp);

    *array = NULL;

    nbytes = vbyte_decode(n, fp);

    if (nbytes == 0)
        return decode_fail(array, n);

    /* Due to the unrolled decoding loop there is no bounds checking
     * per element.  To prevent a segmentation or bus fault during
     * decoding we need to allocate some extra space.  The maximum
     * offset the decoder can run out of bounds is (the maximum number
     * of elements that can be packed into a word) - 1. */
    *array = alloc_output(*n, selectors[0].nitems - 1);

    if (*array == NULL)
        return decode_fail(array, n);

    nitems = 0;

    while (nitems < *n) {
        if (fread(&word, sizeof word, 1, fp) != 1)
            return decode_fail(array, n);

        nbytes += sizeof word;

        selector = word & SELECTOR_MASK;

        word >>= SELECTOR_BITS;

        out = *array + nitems;

        switch (selector) {
        case 0: /* 28 -- 1 bit elements */
            out[0] = (word) & 1;
            out[1] = (word >> 1) & 1;
            out[2] = (word >> 2) & 1;
            out[3] = (word >> 3) & 1;
            out[4] = (word >> 4) & 1;
            out[5] = (word >> 5) & 1;
            out[6] = (word >> 6) & 1;
            out[7] = (word >> 7) & 1;
            out[8] = (word >> 8) & 1;
            out[9] = (word >> 9) & 1;
            out[10] = (word >> 10) & 1;
            out[11] = (word >> 11) & 1;
            out[12] = (word >> 12) & 1;
            out[13] = (word >> 13) & 1;
            out[14] = (word >> 14) & 1;
            out[15] = (word >> 15) & 1;
            out[16] = (word >> 16) & 1;
            out[17] = (word >> 17) & 1;
            out[18] = (word >> 18) & 1;
            out[19] = (word >> 19) & 1;
            out[20] = (word >> 20) & 1;
            out[21] = (word >> 21) & 1;
            out[22] = (word >> 22) & 1;
            out[23] = (word >> 23) & 1;
            out[24] = (word >> 24) & 1;
            out[25] = (word >> 25) & 1;
            out[26] = (word >> 26) & 1;
            out[27] = (word >> 27) & 1;
            break;

        case 1: /* 14 -- 2 bit elements */
            out[0] = (word) & 3;
            out[1] = (word >> 2) & 3;
            out[2] = (word >> 4) & 3;
            out[3] = (word >> 6) & 3;
            out[4] = (word >> 8) & 3;
            out[5] = (word >> 10) & 3;
            out[6] = (word >> 12) & 3;
            out[7] = (word >> 14) & 3;
            out[8] = (word >> 16) & 3;
            out[9] = (word >> 18) & 3;
            out[10] = (word >> 20) & 3;
            out[11] = (word >> 22) & 3;
            out[12] = (word >> 24) & 3;
            out[13] = (word >> 26) & 3;
            break;

        case 2: /* 9 -- 3 bit elements (1 wasted bit) */
            out[0] = (word) & 7;
            out[1] = (word >> 3) & 7;
            out[2] = (word >> 6) & 7;
            out[3] = (word >> 9) & 7;
            out[4] = (word >> 12) & 7;
            out[5] = (word >> 15) & 7;
            out[6] = (word >> 18) & 7;
            out[7] = (word >> 21) & 7;
            out[8] = (word >> 24) & 7;
            break;

        case 3: /* 7 -- 4 bit elements */
            out[0] = (word) & 15;
            out[1] = (word >> 4) & 15;
            out[2] = (word >> 8) & 15;
            out[3] = (word >> 12) & 15;
            out[4] = (word >> 16) & 15;
            out[5] = (word >> 20) & 15;
            out[6] = (word >> 24) & 15;
            break;

        case 4: /* 5 -- 5 bit elements (3 wasted bits) */
            out[0] = (word) & 31;
            out[1] = (word >> 5) & 31;
            out[2] = (word >> 10) & 31;
            out[3] = (word >> 15) & 31;
            out[4] = (word >> 20) & 31;
            break;

        case 5: /* 4 -- 7 bit elements */
            out[0] = (word) & 127;
            out[1] = (word >> 7) & 127;
            out[2] = (word >> 14) & 127;
            out[3] = (word >> 21) & 127;
            break;

        case 6: /* 3 -- 9 bit elements (1 wasted bit) */
            out[0] = (word) & 511;
            out[1] = (word >> 9) & 511;
            out[2] = (word >> 18) & 511;
            break;

        case 7: /* 2 -- 14 bit elements */
            out[0] = (word) & 16383;
            out[1] = (word >> 14) & 16383;
            break;

        case 8: /* 1 -- 28 bit element */
            out[0] = word;
            break;

        default:
            /* Selectors 9..15 are never produced by the encoder.
             * Without this case the loop would never advance. */
            return decode_fail(array, n);
        }

        nitems += selectors[selector].nitems;
    }

    return nbytes;
}

/* End of simple9.c.  Next file in this listing: /simple9.h */
-------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Christopher Hoobin. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | 1. Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY CHRISTOPHER HOOBIN ''AS IS'' AND ANY 17 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CHRISTOPHER HOOBIN OR 20 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 24 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 25 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | 28 | The views and conclusions contained in the software and documentation 29 | are those of the authors and should not be interpreted as representing 30 | official policies, either expressed or implied, of Christopher Hoobin. 31 | */ 32 | 33 | #ifndef SIMPLE9_H 34 | #define SIMPLE9_H 35 | 36 | #include 37 | #include 38 | 39 | /* Performs a simple9 encoding of n elements from array. The result is 40 | * written to the file stream. 41 | * 42 | * Returns the number of bytes written to the file stream. 
*/ 43 | size_t simple9_encode(uint32_t *array, size_t n, FILE *fp); 44 | 45 | /* Decodes a simple9 encoding read from a file stream. 46 | * 47 | * Memory for *array is allocated inside the function and n is also 48 | * set. The caller is responsible for freeing memory. 49 | * 50 | * Returns the number of bytes read from the file stream. */ 51 | size_t simple9_decode(uint32_t **array, size_t *n, FILE *fp); 52 | 53 | size_t simple9_decode_unrolled(uint32_t **array, size_t *n, FILE *fp); 54 | 55 | #endif /* SIMPLE9_H */ 56 | -------------------------------------------------------------------------------- /test-simple9.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "simple9.h" 9 | 10 | struct list { 11 | uint32_t *data; 12 | size_t n; 13 | }; 14 | 15 | static struct list *generate_list(size_t nitems, size_t max_value) 16 | { 17 | struct list *list; 18 | size_t i; 19 | 20 | list = malloc(sizeof *list); 21 | assert(list); 22 | 23 | list->data = malloc(nitems * sizeof(uint32_t)); 24 | assert(list->data); 25 | 26 | list->n = nitems; 27 | 28 | for (i = 0; i < nitems; i++) 29 | list->data[i] = (rand() % max_value); 30 | 31 | return list; 32 | } 33 | 34 | static void free_list(struct list *list) 35 | { 36 | free(list->data); 37 | free(list); 38 | } 39 | 40 | typedef size_t (*decode_fn_t)(uint32_t**, size_t*, FILE*); 41 | 42 | #define TMPFILE "tmpfile" 43 | 44 | static void test(struct list *list, decode_fn_t fn) 45 | { 46 | FILE *fp; 47 | uint32_t *data; 48 | size_t n; 49 | size_t i; 50 | 51 | fp = fopen(TMPFILE, "w"); 52 | assert(fp); 53 | 54 | simple9_encode(list->data, list->n, fp); 55 | 56 | fclose(fp); 57 | 58 | fp = fopen(TMPFILE, "r"); 59 | assert(fp); 60 | 61 | data = NULL; 62 | 63 | fn(&data, &n, fp); 64 | 65 | fclose(fp); 66 | 67 | if (n != list->n) 68 | fprintf(stderr, "[FAILED] n == %lu. 
It should equal %lu\n", 69 | (unsigned long)n, (unsigned long)list->n); 70 | 71 | for (i = 0; i < n; i++) { 72 | if (data[i] != list->data[i]) { 73 | fprintf(stderr, 74 | "[FAILED] data[%lu] = %u. It should equal %u\n", 75 | (unsigned long)i, data[i], list->data[i]); 76 | 77 | exit(EXIT_FAILURE); 78 | } 79 | } 80 | 81 | free(data); 82 | 83 | unlink(TMPFILE); 84 | } 85 | 86 | int main() 87 | { 88 | struct list *list; 89 | size_t test_nitems[] = {32, 128, 1024, 1048576, 10485760}; 90 | size_t test_max_value[] = {4, 8, 32, 128, 512, 16384, 262144, 268435456}; 91 | size_t i; 92 | size_t j; 93 | 94 | srand(time(NULL)); 95 | 96 | for (i = 0; i < sizeof test_nitems / sizeof test_nitems[0]; i++) { 97 | for (j = 0; j < sizeof test_max_value / sizeof test_max_value[0]; j++) { 98 | printf("nitems: %lu, max_value: %lu\n", 99 | (unsigned long)test_nitems[i], 100 | (unsigned long)test_max_value[j]); 101 | 102 | list = generate_list(test_nitems[i], test_max_value[j]); 103 | 104 | test(list, simple9_decode); 105 | 106 | test(list, simple9_decode_unrolled); 107 | 108 | free_list(list); 109 | } 110 | } 111 | 112 | return EXIT_SUCCESS; 113 | } 114 | --------------------------------------------------------------------------------