├── .clang-format ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── bin ├── Makefile ├── columnix_analyze.c ├── columnix_cat.c ├── columnix_dump.c └── columnix_head.c ├── contrib ├── columnix.py └── cpu-features.sh ├── lib ├── Makefile ├── avx.h ├── avx2.h ├── avx512.h ├── column.c ├── column.h ├── common.h ├── compress.c ├── compress.h ├── file.h ├── index.c ├── index.h ├── java.c ├── java.h ├── libcolumnix.pc.in ├── match.c ├── match.h ├── predicate.c ├── predicate.h ├── reader.c ├── reader.h ├── row.c ├── row.h ├── row_group.c ├── row_group.h ├── version.h ├── writer.c └── writer.h └── test ├── Makefile ├── column.c ├── compress.c ├── file.c ├── helpers.h ├── index.c ├── main.c ├── match.c ├── munit.c ├── munit.h ├── predicate.c ├── row.c ├── row_group.c └── temp_file.h /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | IndentWidth: 4 3 | BreakBeforeBraces: Linux 4 | AllowShortBlocksOnASingleLine: false 5 | AllowShortFunctionsOnASingleLine: false 6 | AllowShortIfStatementsOnASingleLine: false 7 | AllowShortLoopsOnASingleLine: false 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.so.* 4 | *.dylib 5 | *.a 6 | 7 | lib/libcolumnix.pc 8 | 9 | test/runner 10 | 11 | *.gcno 12 | *.gcda 13 | *.gcov 14 | coverage*.html 15 | 16 | bin/columnix_analyze 17 | bin/columnix_cat 18 | bin/columnix_dump 19 | bin/columnix_head 20 | 21 | *.pyc 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Chris O'Hara 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without 
restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: lib 2 | lib: 3 | $(MAKE) -C lib $@ 4 | 5 | .PHONY: test 6 | test: lib 7 | $(MAKE) -C test $@ 8 | 9 | .PHONY: clean 10 | clean: 11 | $(MAKE) -C lib clean 12 | $(MAKE) -C bin clean 13 | $(MAKE) -C test clean 14 | rm -f coverage*.html */*.gcda */*.gcno */*.gcov 15 | 16 | .PHONY: install 17 | install: 18 | $(MAKE) -C lib $@ 19 | $(MAKE) -C bin $@ 20 | 21 | .PHONY: uninstall 22 | uninstall: 23 | $(MAKE) -C bin $@ 24 | $(MAKE) -C lib $@ 25 | 26 | .PHONY: valgrind 27 | valgrind: lib 28 | $(MAKE) -C test valgrind 29 | 30 | .PHONY: analyze 31 | analyze: 32 | $(MAKE) -C lib clean 33 | CFLAGS="--analyze $(CFLAGS)" $(MAKE) -C lib compile 34 | 35 | .PHONY: sanitize 36 | sanitize: FLAGS = -fsanitize=undefined -fsanitize=address 37 | sanitize: test_with_flags 38 | 39 | .PHONY: coverage 40 | coverage: FLAGS = -coverage 41 | coverage: test_with_flags 42 | gcovr -r . 
--exclude=test --branch --html --html-details -o coverage.html 43 | open coverage.html 44 | 45 | .PHONY: test_with_flags 46 | test_with_flags: 47 | $(MAKE) -C lib clean 48 | $(MAKE) -C test clean 49 | CFLAGS="$(CFLAGS) $(FLAGS)" LDFLAGS="$(LDFLAGS) $(FLAGS)" $(MAKE) test 50 | 51 | .PHONY: format 52 | format: 53 | clang-format -i */*.{c,h} 54 | 55 | .PHONY: check 56 | check: test 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Columnix 2 | 3 | Columnix is a columnar storage format, similar to [Parquet][parquet] and [ORC][orc]. 4 | 5 | The experiment was to beat Parquet's read performance in [Spark][spark] for flat schemas, 6 | while simultaneously reducing the disk footprint by utilizing newer compression 7 | algorithms such as [lz4][lz4] and [zstd][zstd]. 8 | 9 | Columnix supports: 10 | 1. row groups 11 | 2. indexes (at the row group level, and file level) 12 | 3. vectorized reads 13 | 4. predicate pushdown 14 | 5. lazy reads 15 | 6. AVX2 and AVX512 predicate matching 16 | 7. memory-mapped IO 17 | 18 | Spark's Parquet reader supports 1-4, but has no support for lazy reads, only 19 | limited SIMD support (whatever the JVM provides) and IO is through HDFS. 20 | 21 | Support for complex schemas was not a goal of the project. The format has no support for 22 | Parquet's [Dremel-style][dremel-style] definition & repetition levels or ORC's 23 | [compound types][orc-types] (struct, list, map, union). 24 | 25 | The library does not currently support encoding of data prior to (or instead of) compression, 26 | for example run-length or dict encoding, despite placeholders in the code alluding to it. It was 27 | next on the TODO list, but I'd like to explore alternative approaches such as 28 | [github.com/chriso/treecomp](https://github.com/chriso/treecomp). 
29 | 30 | The following bindings are provided: 31 | - Python (ctypes): [./contrib/columnix.py][py-bindings] 32 | - Spark (JNI): [chriso/columnix-spark][spark-bindings] 33 | 34 | One major caveat: the library uses `mmap` for reads. There is no HDFS compatibility and 35 | so there is limited real world use for the time being. 36 | 37 | 38 | [parquet]: https://parquet.apache.org 39 | [orc]: https://orc.apache.org 40 | [lz4]: https://lz4.github.io/lz4/ 41 | [zstd]: http://facebook.github.io/zstd/ 42 | [spark]: https://spark.apache.org 43 | [dremel-style]: https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet.html 44 | [orc-types]: https://orc.apache.org/docs/types.html 45 | [py-bindings]: https://github.com/chriso/columnix/blob/master/contrib/columnix.py 46 | [spark-bindings]: https://github.com/chriso/columnix-spark 47 | -------------------------------------------------------------------------------- /bin/Makefile: -------------------------------------------------------------------------------- 1 | PREFIX ?= /usr/local 2 | BINDIR ?= $(PREFIX)/bin 3 | 4 | CFLAGS += -std=c99 -Os -g -pedantic -Wall 5 | LDLIBS = -lcolumnix 6 | 7 | SRC = $(wildcard *.c) 8 | OBJ = $(SRC:.c=.o) 9 | BIN = $(basename $(SRC)) 10 | 11 | $(BIN): $(OBJ) 12 | $(CC) $(LDFLAGS) -o $@ $@.o $(LDLIBS) 13 | 14 | .c.o: 15 | $(CC) $(CFLAGS) -c -o $@ $< 16 | 17 | clean: 18 | $(RM) $(BIN) $(OBJ) 19 | 20 | install: $(BIN) 21 | install -m 755 $(BIN) $(BINDIR) 22 | 23 | uninstall: 24 | $(RM) $(addprefix $(BINDIR)/,$(BIN)) 25 | 26 | .PHONY: clean install uninstall 27 | -------------------------------------------------------------------------------- /bin/columnix_analyze.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | 6 | const char *unknown_str = " UNKNOWN"; 7 | 8 | static const char *type_str(enum cx_column_type); 9 | static const char *encoding_str(enum cx_encoding_type); 10 | static const char 
*compression_str(enum cx_compression_type); 11 | 12 | int main(int argc, char *argv[]) 13 | { 14 | struct cx_reader *reader = NULL; 15 | 16 | if (argc != 2) { 17 | fprintf(stderr, "usage: %s \n", argv[0]); 18 | return 1; 19 | } 20 | 21 | const char *path = argv[1]; 22 | reader = cx_reader_new(path); 23 | if (!reader) { 24 | fprintf(stderr, "error: unable to open %s\n", path); 25 | return 1; 26 | } 27 | 28 | size_t column_count = cx_reader_column_count(reader); 29 | 30 | printf("Summary:\n"); 31 | printf(" - path: %s\n", path); 32 | printf(" - rows: %zu\n", cx_reader_row_count(reader)); 33 | printf(" - columns: %zu\n", column_count); 34 | 35 | for (size_t i = 0; i < column_count; i++) { 36 | enum cx_column_type type = cx_reader_column_type(reader, i); 37 | enum cx_encoding_type encoding = cx_reader_column_encoding(reader, i); 38 | int level = 0; 39 | enum cx_compression_type compression = 40 | cx_reader_column_compression(reader, i, &level); 41 | 42 | printf("\nColumn %zu:\n", i); 43 | printf(" - name: %s\n", cx_reader_column_name(reader, i)); 44 | printf(" - type: %s\n", type_str(type)); 45 | if (encoding) 46 | printf(" - type: %s\n", encoding_str(encoding)); 47 | if (compression) 48 | printf(" - compression: %s (level %d)\n", 49 | compression_str(compression), level); 50 | } 51 | 52 | cx_reader_free(reader); 53 | 54 | return 0; 55 | } 56 | 57 | static const char *type_str(enum cx_column_type type) 58 | { 59 | switch (type) { 60 | case CX_COLUMN_BIT: 61 | return "BIT"; 62 | case CX_COLUMN_I32: 63 | return "I32"; 64 | case CX_COLUMN_I64: 65 | return "I64"; 66 | case CX_COLUMN_FLT: 67 | return "FLT"; 68 | case CX_COLUMN_DBL: 69 | return "DBL"; 70 | case CX_COLUMN_STR: 71 | return "STR"; 72 | default: 73 | break; 74 | } 75 | return unknown_str; 76 | } 77 | 78 | static const char *encoding_str(enum cx_encoding_type type) 79 | { 80 | return unknown_str; 81 | } 82 | 83 | static const char *compression_str(enum cx_compression_type type) 84 | { 85 | switch (type) { 86 | 
case CX_COMPRESSION_LZ4: 87 | return "LZ4"; 88 | case CX_COMPRESSION_LZ4HC: 89 | return "LZ4HC"; 90 | case CX_COMPRESSION_ZSTD: 91 | return "ZSTD"; 92 | default: 93 | break; 94 | } 95 | return unknown_str; 96 | } 97 | -------------------------------------------------------------------------------- /bin/columnix_cat.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | static void show_usage(const char *name) 9 | { 10 | fprintf(stderr, "usage: %s [--row-group-size=N] [--help] OUTPUT INPUT...\n", 11 | name); 12 | exit(0); 13 | } 14 | 15 | static bool cx_reader_cat(struct cx_writer *writer, struct cx_reader *reader, 16 | enum cx_column_type types[]); 17 | 18 | int main(int argc, char *argv[]) 19 | { 20 | static struct option options[] = { 21 | {"row-group-size", required_argument, 0, 'r'}, 22 | {"help", no_argument, 0, 'h'}, 23 | {0, 0, 0, 0}}; 24 | 25 | const char *name = argv[0]; 26 | 27 | bool sync = true; 28 | size_t row_group_size = 100000; 29 | 30 | const char *row_group_size_arg = NULL; 31 | 32 | for (;;) { 33 | int index = 0; 34 | int c = getopt_long(argc, argv, "hr:", options, &index); 35 | if (c == -1) 36 | break; 37 | switch (c) { 38 | case 0: 39 | if (options[index].flag) 40 | break; 41 | switch (index) { 42 | case 0: 43 | row_group_size_arg = optarg; 44 | break; 45 | case 1: 46 | show_usage(name); 47 | break; 48 | default: 49 | abort(); 50 | } 51 | break; 52 | case 'r': 53 | row_group_size_arg = optarg; 54 | break; 55 | case 'h': 56 | case '?': 57 | show_usage(name); 58 | break; 59 | default: 60 | abort(); 61 | } 62 | } 63 | 64 | argc -= optind; 65 | argv += optind; 66 | 67 | if (argc < 2) 68 | show_usage(name); 69 | 70 | if (row_group_size_arg) { 71 | char *row_group_size_end = NULL; 72 | row_group_size = strtoull(row_group_size_arg, &row_group_size_end, 10); 73 | if (row_group_size_end[0] != '\0') { 74 | fprintf(stderr, "error: invalid row group 
size: %s\n", 75 | row_group_size_arg); 76 | return 1; 77 | } 78 | } 79 | 80 | const char *output_path = argv[0]; 81 | 82 | fprintf(stderr, "Writing to %s with row group size %zu\n", output_path, 83 | row_group_size); 84 | 85 | enum cx_column_type *types = NULL; 86 | 87 | struct cx_reader *reader = NULL; 88 | struct cx_writer *writer = cx_writer_new(output_path, row_group_size); 89 | if (!writer) { 90 | fprintf(stderr, "Failed to create writer for path %s\n", output_path); 91 | goto error; 92 | } 93 | 94 | size_t writer_column_count = 0; 95 | 96 | for (int i = 1; i < argc; i++) { 97 | const char *input_path = argv[i]; 98 | fprintf(stderr, "Adding %s to %s\n", input_path, output_path); 99 | 100 | bool first_input = i == 1; 101 | 102 | reader = cx_reader_new(input_path); 103 | if (!reader) { 104 | fprintf(stderr, "Failed to create reader for path %s\n", 105 | input_path); 106 | goto error; 107 | } 108 | 109 | size_t column_count = cx_reader_column_count(reader); 110 | if (!column_count) { 111 | fprintf(stderr, "error: no columns in %s\n", input_path); 112 | goto error; 113 | } 114 | 115 | if (first_input) { 116 | writer_column_count = column_count; 117 | types = malloc(column_count * sizeof(*types)); 118 | if (!types) { 119 | fprintf(stderr, "error: no memory\n"); 120 | goto error; 121 | } 122 | } else if (writer_column_count != column_count) { 123 | fprintf(stderr, "error: column count mismatch\n"); 124 | goto error; 125 | } 126 | 127 | for (size_t j = 0; j < column_count; j++) { 128 | enum cx_column_type type = cx_reader_column_type(reader, j); 129 | if (first_input) { 130 | types[j] = type; 131 | int level; 132 | enum cx_compression_type compression = 133 | cx_reader_column_compression(reader, j, &level); 134 | if (!cx_writer_add_column( 135 | writer, cx_reader_column_name(reader, j), type, 136 | cx_reader_column_encoding(reader, j), compression, 137 | level)) { 138 | fprintf(stderr, "Failed to add column %zu from %s\n", j, 139 | input_path); 140 | goto error; 141 
| } 142 | } else if (type != types[j]) { 143 | fprintf(stderr, "error: type mismatch for column %zu of %s\n", 144 | j, input_path); 145 | goto error; 146 | } 147 | } 148 | 149 | if (!cx_reader_cat(writer, reader, types)) { 150 | fprintf(stderr, "error: failed to cat %s\n", input_path); 151 | goto error; 152 | } 153 | 154 | cx_reader_free(reader); 155 | } 156 | 157 | if (!cx_writer_finish(writer, sync)) { 158 | fprintf(stderr, "error: failed to finish writes\n"); 159 | goto error; 160 | } 161 | 162 | if (types) 163 | free(types); 164 | 165 | cx_writer_free(writer); 166 | 167 | return 0; 168 | 169 | error: 170 | if (types) 171 | free(types); 172 | if (writer) 173 | cx_writer_free(writer); 174 | if (reader) 175 | cx_reader_free(reader); 176 | return 1; 177 | } 178 | 179 | static bool cx_reader_cat(struct cx_writer *writer, struct cx_reader *reader, 180 | enum cx_column_type types[]) 181 | { 182 | size_t column_count = cx_reader_column_count(reader); 183 | while (cx_reader_next(reader)) { 184 | for (size_t j = 0; j < column_count; j++) { 185 | cx_value_t value; 186 | if (!cx_reader_get_null(reader, j, &value.bit)) 187 | goto error; 188 | if (value.bit) { 189 | if (!cx_writer_put_null(writer, j)) 190 | goto error; 191 | } else { 192 | switch (types[j]) { 193 | case CX_COLUMN_BIT: 194 | if (!cx_reader_get_bit(reader, j, &value.bit)) 195 | goto error; 196 | if (!cx_writer_put_bit(writer, j, value.bit)) 197 | goto error; 198 | break; 199 | case CX_COLUMN_I32: 200 | if (!cx_reader_get_i32(reader, j, &value.i32)) 201 | goto error; 202 | if (!cx_writer_put_i32(writer, j, value.i32)) 203 | goto error; 204 | break; 205 | case CX_COLUMN_I64: 206 | if (!cx_reader_get_i64(reader, j, &value.i64)) 207 | goto error; 208 | if (!cx_writer_put_i64(writer, j, value.i64)) 209 | goto error; 210 | break; 211 | case CX_COLUMN_FLT: 212 | if (!cx_reader_get_flt(reader, j, &value.flt)) 213 | goto error; 214 | if (!cx_writer_put_flt(writer, j, value.flt)) 215 | goto error; 216 | break; 217 | 
case CX_COLUMN_DBL: 218 | if (!cx_reader_get_dbl(reader, j, &value.dbl)) 219 | goto error; 220 | if (!cx_writer_put_dbl(writer, j, value.dbl)) 221 | goto error; 222 | break; 223 | case CX_COLUMN_STR: 224 | if (!cx_reader_get_str(reader, j, &value.str)) 225 | goto error; 226 | if (!cx_writer_put_str(writer, j, value.str.ptr)) 227 | goto error; 228 | break; 229 | } 230 | } 231 | } 232 | } 233 | return !cx_reader_error(reader); 234 | error: 235 | return false; 236 | } 237 | -------------------------------------------------------------------------------- /bin/columnix_dump.c: -------------------------------------------------------------------------------- 1 | #define __STDC_FORMAT_MACROS 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | const char *null_string = "(null)"; 9 | 10 | static bool cx_print_value(struct cx_reader *reader, size_t column, 11 | enum cx_column_type type); 12 | 13 | int main(int argc, char *argv[]) 14 | { 15 | struct cx_reader *reader = NULL; 16 | 17 | if (argc != 3) { 18 | fprintf(stderr, "usage: %s \n", argv[0]); 19 | goto error; 20 | } 21 | 22 | reader = cx_reader_new(argv[1]); 23 | if (!reader) { 24 | fprintf(stderr, "error: unable to open '%s'\n", argv[1]); 25 | goto error; 26 | } 27 | 28 | int column = atoi(argv[2]); 29 | if (column < 0 || column >= cx_reader_column_count(reader)) { 30 | fprintf(stderr, "error: invalid column index '%s'\n", argv[2]); 31 | goto error; 32 | } 33 | 34 | enum cx_column_type type = cx_reader_column_type(reader, column); 35 | 36 | while (cx_reader_next(reader)) { 37 | if (!cx_print_value(reader, column, type)) { 38 | fprintf(stderr, "error: failed to read column value\n"); 39 | goto error; 40 | } 41 | } 42 | 43 | if (cx_reader_error(reader)) { 44 | fprintf(stderr, "error: iter\n"); 45 | goto error; 46 | } 47 | 48 | cx_reader_free(reader); 49 | 50 | return 0; 51 | 52 | error: 53 | if (reader) 54 | cx_reader_free(reader); 55 | return 1; 56 | } 57 | 58 | static bool cx_print_value(struct cx_reader 
*reader, size_t column, 59 | enum cx_column_type type) 60 | { 61 | cx_value_t value; 62 | if (!cx_reader_get_null(reader, column, &value.bit)) 63 | goto error; 64 | if (value.bit) { 65 | puts(null_string); 66 | } else { 67 | switch (type) { 68 | case CX_COLUMN_BIT: 69 | if (!cx_reader_get_bit(reader, column, &value.bit)) 70 | goto error; 71 | puts(value.bit ? "1" : "0"); 72 | break; 73 | case CX_COLUMN_I32: 74 | if (!cx_reader_get_i32(reader, column, &value.i32)) 75 | goto error; 76 | printf("%d\n", value.i32); 77 | break; 78 | case CX_COLUMN_I64: 79 | if (!cx_reader_get_i64(reader, column, &value.i64)) 80 | goto error; 81 | printf("%" PRIi64 "\n", value.i64); 82 | break; 83 | case CX_COLUMN_FLT: 84 | if (!cx_reader_get_flt(reader, column, &value.flt)) 85 | goto error; 86 | printf("%f\n", value.flt); 87 | break; 88 | case CX_COLUMN_DBL: 89 | if (!cx_reader_get_dbl(reader, column, &value.dbl)) 90 | goto error; 91 | printf("%f\n", value.dbl); 92 | break; 93 | case CX_COLUMN_STR: 94 | if (!cx_reader_get_str(reader, column, &value.str)) 95 | goto error; 96 | printf("%s\n", value.str.ptr); 97 | break; 98 | } 99 | } 100 | return true; 101 | error: 102 | return false; 103 | } 104 | -------------------------------------------------------------------------------- /bin/columnix_head.c: -------------------------------------------------------------------------------- 1 | #define __STDC_FORMAT_MACROS 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | const char *null_string = "(null)"; 9 | 10 | static bool cx_print_value(struct cx_reader *reader, size_t column); 11 | 12 | int main(int argc, char *argv[]) 13 | { 14 | struct cx_reader *reader = NULL; 15 | 16 | size_t count = 20; 17 | 18 | if (argc != 2) { 19 | fprintf(stderr, "usage: %s \n", argv[0]); 20 | goto error; 21 | } 22 | 23 | reader = cx_reader_new(argv[1]); 24 | if (!reader) { 25 | fprintf(stderr, "error: unable to open '%s'\n", argv[1]); 26 | goto error; 27 | } 28 | 29 | size_t column_count = 
cx_reader_column_count(reader); 30 | for (size_t i = 0; i < column_count; i++) { 31 | if (i) 32 | printf("\t"); 33 | const char *name = cx_reader_column_name(reader, i); 34 | if (!name) { 35 | fprintf(stderr, "error: failed to read column name\n"); 36 | goto error; 37 | } 38 | printf("%s", name); 39 | } 40 | printf("\n"); 41 | 42 | while (cx_reader_next(reader) && count--) { 43 | for (size_t i = 0; i < column_count; i++) { 44 | if (i) 45 | printf("\t"); 46 | if (!cx_print_value(reader, i)) { 47 | fprintf(stderr, "error: failed to read column value\n"); 48 | goto error; 49 | } 50 | } 51 | printf("\n"); 52 | } 53 | 54 | if (cx_reader_error(reader)) { 55 | fprintf(stderr, "error: iter\n"); 56 | goto error; 57 | } 58 | 59 | cx_reader_free(reader); 60 | 61 | return 0; 62 | 63 | error: 64 | if (reader) 65 | cx_reader_free(reader); 66 | return 1; 67 | } 68 | 69 | static bool cx_print_value(struct cx_reader *reader, size_t column) 70 | { 71 | cx_value_t value; 72 | if (!cx_reader_get_null(reader, column, &value.bit)) 73 | goto error; 74 | if (value.bit) { 75 | printf("%s", null_string); 76 | } else { 77 | switch (cx_reader_column_type(reader, column)) { 78 | case CX_COLUMN_BIT: 79 | if (!cx_reader_get_bit(reader, column, &value.bit)) 80 | goto error; 81 | printf("%d", value.bit ? 
1 : 0); 82 | break; 83 | case CX_COLUMN_I32: 84 | if (!cx_reader_get_i32(reader, column, &value.i32)) 85 | goto error; 86 | printf("%d", value.i32); 87 | break; 88 | case CX_COLUMN_I64: 89 | if (!cx_reader_get_i64(reader, column, &value.i64)) 90 | goto error; 91 | printf("%" PRIi64, value.i64); 92 | break; 93 | case CX_COLUMN_FLT: 94 | if (!cx_reader_get_flt(reader, column, &value.flt)) 95 | goto error; 96 | printf("%f", value.flt); 97 | break; 98 | case CX_COLUMN_DBL: 99 | if (!cx_reader_get_dbl(reader, column, &value.dbl)) 100 | goto error; 101 | printf("%f", value.dbl); 102 | break; 103 | case CX_COLUMN_STR: 104 | if (!cx_reader_get_str(reader, column, &value.str)) 105 | goto error; 106 | printf("%s", value.str.ptr); 107 | break; 108 | } 109 | } 110 | return true; 111 | error: 112 | return false; 113 | } 114 | -------------------------------------------------------------------------------- /contrib/columnix.py: -------------------------------------------------------------------------------- 1 | """ 2 | Python bindings for columnix. 
3 | 4 | Example write (Python): 5 | 6 | from columnix import Writer, Column, I64, I32, STR, LZ4, ZSTD 7 | 8 | columns = [Column(I64, "timestamp", compression=LZ4), 9 | Column(STR, "email", compression=ZSTD), 10 | Column(I32, "id")] 11 | 12 | rows = [(1400000000000, "foo@bar.com", 23), 13 | (1400000001000, "foo@bar.com", 45), 14 | (1400000002000, "baz@bar.com", 67)] 15 | 16 | with Writer("example.cx", columns, row_group_size=2) as writer: 17 | for row in rows: 18 | writer.put(row) 19 | 20 | Example read (C): 21 | 22 | #define __STDC_FORMAT_MACROS 23 | #include <assert.h> 24 | #include <inttypes.h> 25 | #include <stdio.h> 26 | #include <columnix/reader.h> 27 | 28 | int main() 29 | { 30 | struct cx_reader *reader = cx_reader_new("example.cx"); 31 | assert(reader); 32 | int64_t timestamp; 33 | const struct cx_string *email; 34 | int32_t event; 35 | while (cx_reader_next(reader)) { 36 | assert(cx_reader_get_i64(reader, 0, &timestamp) && 37 | cx_reader_get_str(reader, 1, &email) && 38 | cx_reader_get_i32(reader, 2, &event)); 39 | printf("{%" PRIi64 ", %s, %d}\n", timestamp, email->ptr, event); 40 | } 41 | assert(!cx_reader_error(reader)); 42 | cx_reader_free(reader); 43 | } 44 | """ 45 | 46 | from ctypes import cdll, util 47 | from ctypes import (c_char_p, c_size_t, c_void_p, c_int, c_int32, c_int64, 48 | c_bool, c_float, c_double) 49 | 50 | lib = cdll.LoadLibrary(util.find_library("columnix")) 51 | 52 | cx_writer_new = lib.cx_writer_new 53 | cx_writer_new.argtypes = [c_char_p, c_size_t] 54 | cx_writer_new.restype = c_void_p 55 | 56 | cx_writer_free = lib.cx_writer_free 57 | cx_writer_free.argtypes = [c_void_p] 58 | 59 | cx_writer_add_column = lib.cx_writer_add_column 60 | cx_writer_add_column.argtypes = [c_void_p, c_char_p, c_int, c_int, c_int, c_int] 61 | 62 | cx_writer_put_null = lib.cx_writer_put_null 63 | cx_writer_put_null.argtypes = [c_void_p, c_size_t] 64 | 65 | cx_writer_put_bit = lib.cx_writer_put_bit 66 | cx_writer_put_bit.argtypes = [c_void_p, c_size_t, c_bool] 67 | 68 | cx_writer_put_i32 = lib.cx_writer_put_i32 69 | 
cx_writer_put_i32.argtypes = [c_void_p, c_size_t, c_int32] 70 | 71 | cx_writer_put_i64 = lib.cx_writer_put_i64 72 | cx_writer_put_i64.argtypes = [c_void_p, c_size_t, c_int64] 73 | 74 | cx_writer_put_flt = lib.cx_writer_put_flt 75 | cx_writer_put_flt.argtypes = [c_void_p, c_size_t, c_float] 76 | 77 | cx_writer_put_dbl = lib.cx_writer_put_dbl 78 | cx_writer_put_dbl.argtypes = [c_void_p, c_size_t, c_double] 79 | 80 | cx_writer_put_str = lib.cx_writer_put_str 81 | cx_writer_put_str.argtypes = [c_void_p, c_size_t, c_char_p] 82 | 83 | cx_writer_finish = lib.cx_writer_finish 84 | cx_writer_finish.argtypes = [c_void_p, c_bool] 85 | 86 | BIT = 0 87 | I32 = 1 88 | I64 = 2 89 | FLT = 3 90 | DBL = 4 91 | STR = 5 92 | 93 | LZ4 = 1 94 | LZ4HC = 2 95 | ZSTD = 3 96 | 97 | 98 | class Column(object): 99 | def __init__(self, type, name, encoding=None, compression=None, level=1): 100 | self.type = type 101 | self.name = name 102 | self.encoding = encoding or 0 103 | self.compression = compression or 0 104 | self.level = level 105 | 106 | 107 | class Writer(object): 108 | def __init__(self, path, columns, row_group_size=100000, sync=True): 109 | self.path = path 110 | self.columns = columns 111 | self.row_group_size = row_group_size 112 | self.sync = sync 113 | put_fn = [self._put_bit, self._put_i32, self._put_i64, self._put_flt, 114 | self._put_dbl, self._put_str] 115 | self.put_lookup = [put_fn[column.type] for column in columns] 116 | self.writer = None 117 | 118 | def __enter__(self): 119 | assert self.writer is None 120 | self.writer = cx_writer_new(self.path, self.row_group_size) 121 | if not self.writer: 122 | raise RuntimeError("failed to create writer for %s" % self.path) 123 | for column in self.columns: 124 | if not cx_writer_add_column(self.writer, column.name, column.type, 125 | column.encoding, column.compression, 126 | column.level): 127 | raise RuntimeError("failed to add column") 128 | return self 129 | 130 | def __exit__(self, err, value, traceback): 131 | assert 
self.writer is not None 132 | if not err: 133 | cx_writer_finish(self.writer, self.sync) 134 | cx_writer_free(self.writer) 135 | self.writer = None 136 | 137 | def put(self, row): 138 | assert self.writer is not None 139 | put_lookup = self.put_lookup 140 | put_null = self._put_null 141 | for column, value in enumerate(row): 142 | if value is None: 143 | put_null(column) 144 | else: 145 | put_lookup[column](column, value) 146 | 147 | def _put_null(self, column): 148 | if not cx_writer_put_null(self.writer, column): 149 | raise RuntimeError("put_null(%d)" % column) 150 | 151 | def _put_bit(self, column, value): 152 | if not cx_writer_put_bit(self.writer, column, value): 153 | raise RuntimeError("put_bit(%d, %r)" % (column, value)) 154 | 155 | def _put_i32(self, column, value): 156 | if not cx_writer_put_i32(self.writer, column, value): 157 | raise RuntimeError("put_i32(%d, %r)" % (column, value)) 158 | 159 | def _put_i64(self, column, value): 160 | if not cx_writer_put_i64(self.writer, column, value): 161 | raise RuntimeError("put_i64(%d, %r)" % (column, value)) 162 | 163 | def _put_flt(self, column, value): 164 | if not cx_writer_put_flt(self.writer, column, value): 165 | raise RuntimeError("put_flt(%d, %r)" % (column, value)) 166 | 167 | def _put_dbl(self, column, value): 168 | if not cx_writer_put_dbl(self.writer, column, value): 169 | raise RuntimeError("put_dbl(%d, %r)" % (column, value)) 170 | 171 | def _put_str(self, column, value): 172 | if not cx_writer_put_str(self.writer, column, value): 173 | raise RuntimeError("put_str(%d, %r)" % (column, value)) 174 | -------------------------------------------------------------------------------- /contrib/cpu-features.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$(uname -s)" = Darwin ]; then 4 | echo $(sysctl -n machdep.cpu.features) $(sysctl -n machdep.cpu.leaf7_features) | tr . 
_ 5 | else 6 | cat /proc/cpuinfo | grep flags | head -1 | cut -d: -f2- | tr 'a-z' 'A-Z' 7 | fi 8 | -------------------------------------------------------------------------------- /lib/Makefile: -------------------------------------------------------------------------------- 1 | PROJECT = columnix 2 | 3 | # these are also set in version.h 4 | VERSION_MAJOR = 1 5 | VERSION_MINOR = 0 6 | VERSION_RELEASE = 0 7 | VERSION = $(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_RELEASE) 8 | 9 | PREFIX ?= /usr/local 10 | LIBDIR ?= $(PREFIX)/lib 11 | INCLUDEDIR ?= $(PREFIX)/include 12 | PKGCONFIGDIR ?= $(LIBDIR)/pkgconfig 13 | 14 | LDLIBS = -llz4 -lzstd 15 | 16 | CFLAGS += -std=c99 -g -pedantic -Wall -pthread -fvisibility=hidden 17 | LDFLAGS += -fvisibility=hidden 18 | 19 | OPTFLAGS ?= -O3 -march=native 20 | 21 | SRC = column.c compress.c index.c match.c predicate.c \ 22 | reader.c row.c row_group.c writer.c 23 | 24 | HEADERS = column.h common.h compress.h file.h index.h \ 25 | predicate.h reader.h row.h row_group.h version.h writer.h 26 | 27 | ifeq ($(java), 1) 28 | JAVA_HOME := $(shell /usr/libexec/java_home) 29 | JAVA_OS := $(shell uname | tr A-Z a-z) 30 | CFLAGS += -I$(JAVA_HOME)/include -I$(JAVA_HOME)/include/$(JAVA_OS) 31 | SRC += java.c 32 | endif 33 | 34 | LIBNAME = lib$(PROJECT) 35 | 36 | ifeq ($(shell uname), Darwin) 37 | LIBEXT = dylib 38 | SHARED_LIB_VERSION = $(LIBNAME).$(VERSION).$(LIBEXT) 39 | SHARED_LIB_MAJOR = $(LIBNAME).$(VERSION_MAJOR).$(LIBEXT) 40 | SHARED_LIB = $(LIBNAME).$(LIBEXT) 41 | SONAME_FLAGS = -install_name $(LIBDIR)/$(SHARED_LIB_MAJOR) \ 42 | -compatibility_version $(VERSION_MAJOR) \ 43 | -current_version $(VERSION) 44 | else 45 | CFLAGS += -fPIC 46 | LDFLAGS += -pthread 47 | LIBEXT = so 48 | SHARED_LIB_VERSION = $(LIBNAME).$(LIBEXT).$(VERSION) 49 | SHARED_LIB_MAJOR = $(LIBNAME).$(LIBEXT).$(VERSION_MAJOR) 50 | SONAME_FLAGS = -Wl,-soname=$(SHARED_LIB_VERSION) 51 | endif 52 | 53 | SHARED_LIB = $(LIBNAME).$(LIBEXT) 54 | STATIC_LIB = $(LIBNAME).a 55 | 56 | 
PKGCONFIG_FILE = $(LIBNAME).pc 57 | 58 | OBJ := $(SRC:.c=.o) 59 | 60 | ifneq ($(debug), 1) 61 | CFLAGS += $(OPTFLAGS) 62 | CPU_FEATURES := $(shell ../contrib/cpu-features.sh) 63 | ifneq (,$(findstring AVX512F,$(CPU_FEATURES))) 64 | CFLAGS += -DCX_AVX512 -mavx512f 65 | else ifneq (,$(findstring AVX2,$(CPU_FEATURES))) 66 | CFLAGS += -DCX_AVX2 -mavx2 67 | else ifneq (,$(findstring AVX,$(CPU_FEATURES))) 68 | CFLAGS += -DCX_AVX -mavx 69 | endif 70 | ifneq (,$(findstring SSE4_2,$(CPU_FEATURES))) 71 | CFLAGS += -DCX_SSE42 -msse4.2 72 | endif 73 | endif 74 | 75 | lib: $(SHARED_LIB_VERSION) $(STATIC_LIB) 76 | 77 | $(STATIC_LIB): $(OBJ) 78 | $(AR) rcs $@ $^ 79 | 80 | $(SHARED_LIB_VERSION): $(OBJ) 81 | $(CC) $^ $(LDFLAGS) $(LDLIBS) $(SONAME_FLAGS) -shared -o $@ 82 | 83 | .c.o: 84 | $(CC) $(CFLAGS) -c -o $@ $< 85 | 86 | $(PKGCONFIG_FILE): $(PKGCONFIG_FILE).in 87 | sed -e 's|@PREFIX@|$(PREFIX)|' \ 88 | -e 's|@LIBDIR@|$(LIBDIR)|' \ 89 | -e 's|@INCLUDEDIR@|$(INCLUDEDIR)|' \ 90 | -e 's|@VERSION@|$(VERSION)|' \ 91 | $< >$@ 92 | 93 | compile: $(OBJ) 94 | 95 | clean: 96 | $(RM) $(OBJ) $(STATIC_LIB) $(LIBNAME).dylib $(LIBNAME).so.* $(PKGCONFIG_FILE) 97 | 98 | install: lib $(PKGCONFIG_FILE) 99 | install -m 755 $(SHARED_LIB_VERSION) $(STATIC_LIB) $(LIBDIR)/ 100 | ln -sf $(SHARED_LIB_VERSION) $(LIBDIR)/$(SHARED_LIB) 101 | ln -sf $(SHARED_LIB_VERSION) $(LIBDIR)/$(SHARED_LIB_MAJOR) 102 | install -d $(INCLUDEDIR)/$(PROJECT)/ 103 | install -m 644 $(HEADERS) $(INCLUDEDIR)/$(PROJECT)/ 104 | install -m 644 $(PKGCONFIG_FILE) $(PKGCONFIGDIR)/ 105 | 106 | uninstall: 107 | $(RM) $(addprefix $(LIBDIR)/,$(SHARED_LIB) $(SHARED_LIB_VERSION) $(SHARED_LIB_MAJOR) $(STATIC_LIB)) 108 | $(RM) $(PKGCONFIGDIR)/$(PKGCONFIG_FILE) 109 | $(RM) $(INCLUDEDIR)/$(PROJECT)/*.h 110 | rmdir $(INCLUDEDIR)/$(PROJECT) 111 | 112 | .PHONY: compile clean install uninstall 113 | -------------------------------------------------------------------------------- /lib/avx.h: 
// 128-bit helpers for vectors of four signed 32-bit integers.
//
// Each comparison returns an int bitmask with one bit per lane: bit i is set
// when lane i satisfies the comparison. The ordered comparisons test the
// second operand against the first, i.e. cx_simd_i32_lt(a, b) reports the
// lanes where b < a and cx_simd_i32_gt(a, b) the lanes where b > a.

// Broadcast `value` into all four lanes.
static inline __m128i cx_simd_i32_set(int32_t value)
{
    return _mm_set1_epi32(value);
}

// Load four consecutive int32_t values starting at `ptr` (unaligned load).
static inline __m128i cx_simd_i32_load(const int32_t *ptr)
{
    // keep the const qualifier: the load never writes through the pointer
    return _mm_loadu_si128((const __m128i *)ptr);
}

// Collapse a lane-wise comparison result into a 4-bit mask by collecting the
// most significant bit of every 32-bit lane.
static inline int cx_simd_i32_mask(__m128i vec)
{
    // reinterpret the bits with the cast intrinsic instead of a compiler
    // specific vector C-cast; no conversion instruction is generated
    return _mm_movemask_ps(_mm_castsi128_ps(vec));
}

// Mask of lanes where a == b.
static inline int cx_simd_i32_eq(__m128i a, __m128i b)
{
    return cx_simd_i32_mask(_mm_cmpeq_epi32(a, b));
}

// Mask of lanes where b < a (expressed as a > b, the only ordered integer
// comparison used here).
static inline int cx_simd_i32_lt(__m128i a, __m128i b)
{
    return cx_simd_i32_mask(_mm_cmpgt_epi32(a, b));
}

// Mask of lanes where b > a.
static inline int cx_simd_i32_gt(__m128i a, __m128i b)
{
    return cx_simd_i32_mask(_mm_cmpgt_epi32(b, a));
}
cx_simd_flt_mask(__m128 vec) 79 | { 80 | return _mm_movemask_ps(vec); 81 | } 82 | 83 | static inline int cx_simd_flt_eq(__m128 a, __m128 b) 84 | { 85 | return cx_simd_flt_mask(_mm_cmp_ps(a, b, _CMP_EQ_OQ)); 86 | } 87 | 88 | static inline int cx_simd_flt_lt(__m128 a, __m128 b) 89 | { 90 | return cx_simd_flt_mask(_mm_cmp_ps(b, a, _CMP_LT_OQ)); 91 | } 92 | 93 | static inline int cx_simd_flt_gt(__m128 a, __m128 b) 94 | { 95 | return cx_simd_flt_mask(_mm_cmp_ps(b, a, _CMP_GT_OQ)); 96 | } 97 | 98 | static inline __m128d cx_simd_dbl_set(double value) 99 | { 100 | return _mm_set1_pd(value); 101 | } 102 | 103 | static inline __m128d cx_simd_dbl_load(const double *ptr) 104 | { 105 | return _mm_loadu_pd(ptr); 106 | } 107 | 108 | static inline int cx_simd_dbl_mask(__m128d vec) 109 | { 110 | return _mm_movemask_pd(vec); 111 | } 112 | 113 | static inline int cx_simd_dbl_eq(__m128d a, __m128d b) 114 | { 115 | return cx_simd_dbl_mask(_mm_cmp_pd(a, b, _CMP_EQ_OQ)); 116 | } 117 | 118 | static inline int cx_simd_dbl_lt(__m128d a, __m128d b) 119 | { 120 | return cx_simd_dbl_mask(_mm_cmp_pd(b, a, _CMP_LT_OQ)); 121 | } 122 | 123 | static inline int cx_simd_dbl_gt(__m128d a, __m128d b) 124 | { 125 | return cx_simd_dbl_mask(_mm_cmp_pd(b, a, _CMP_GT_OQ)); 126 | } 127 | -------------------------------------------------------------------------------- /lib/avx2.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | typedef __m256i cx_i32_vec_t; 4 | typedef __m256i cx_i64_vec_t; 5 | typedef __m256 cx_flt_vec_t; 6 | typedef __m256d cx_dbl_vec_t; 7 | 8 | static inline __m256i cx_simd_i32_set(int32_t value) 9 | { 10 | return _mm256_set1_epi32(value); 11 | } 12 | 13 | static inline __m256i cx_simd_i32_load(const int32_t *ptr) 14 | { 15 | return _mm256_loadu_si256((__m256i *)ptr); 16 | } 17 | 18 | static inline int cx_simd_i32_mask(__m256i vec) 19 | { 20 | return _mm256_movemask_ps((__m256)vec); 21 | } 22 | 23 | static inline int 
cx_simd_i32_eq(__m256i a, __m256i b) 24 | { 25 | return cx_simd_i32_mask(_mm256_cmpeq_epi32(a, b)); 26 | } 27 | 28 | static inline int cx_simd_i32_lt(__m256i a, __m256i b) 29 | { 30 | return cx_simd_i32_mask(_mm256_cmpgt_epi32(a, b)); 31 | } 32 | 33 | static inline int cx_simd_i32_gt(__m256i a, __m256i b) 34 | { 35 | return cx_simd_i32_mask(_mm256_cmpgt_epi32(b, a)); 36 | } 37 | 38 | static inline __m256i cx_simd_i64_set(int64_t value) 39 | { 40 | return _mm256_set1_epi64x(value); 41 | } 42 | 43 | static inline __m256i cx_simd_i64_load(const int64_t *ptr) 44 | { 45 | return _mm256_loadu_si256((__m256i *)ptr); 46 | } 47 | 48 | static inline int cx_simd_i64_mask(__m256i vec) 49 | { 50 | return _mm256_movemask_pd((__m256d)vec); 51 | } 52 | 53 | static inline int cx_simd_i64_eq(__m256i a, __m256i b) 54 | { 55 | return cx_simd_i64_mask(_mm256_cmpeq_epi64(a, b)); 56 | } 57 | 58 | static inline int cx_simd_i64_lt(__m256i a, __m256i b) 59 | { 60 | return cx_simd_i64_mask(_mm256_cmpgt_epi64(a, b)); 61 | } 62 | 63 | static inline int cx_simd_i64_gt(__m256i a, __m256i b) 64 | { 65 | return cx_simd_i64_mask(_mm256_cmpgt_epi64(b, a)); 66 | } 67 | 68 | static inline __m256 cx_simd_flt_set(float value) 69 | { 70 | return _mm256_set1_ps(value); 71 | } 72 | 73 | static inline __m256 cx_simd_flt_load(const float *ptr) 74 | { 75 | return _mm256_loadu_ps(ptr); 76 | } 77 | 78 | static inline int cx_simd_flt_mask(__m256 vec) 79 | { 80 | return _mm256_movemask_ps(vec); 81 | } 82 | 83 | static inline int cx_simd_flt_eq(__m256 a, __m256 b) 84 | { 85 | return cx_simd_flt_mask(_mm256_cmp_ps(a, b, _CMP_EQ_OQ)); 86 | } 87 | 88 | static inline int cx_simd_flt_lt(__m256 a, __m256 b) 89 | { 90 | return cx_simd_flt_mask(_mm256_cmp_ps(b, a, _CMP_LT_OQ)); 91 | } 92 | 93 | static inline int cx_simd_flt_gt(__m256 a, __m256 b) 94 | { 95 | return cx_simd_flt_mask(_mm256_cmp_ps(b, a, _CMP_GT_OQ)); 96 | } 97 | 98 | static inline __m256d cx_simd_dbl_set(double value) 99 | { 100 | return 
// 512-bit integer comparison helpers.
//
// Each comparison returns the lane mask produced by the AVX-512 compare
// intrinsic, converted to int: bit i is set when lane i satisfies the
// comparison. The ordered comparisons test the second operand against the
// first, matching cx_simd_flt_lt/cx_simd_dbl_lt in this header, which pass
// (b, a) to the compare intrinsic: cx_simd_*_lt(a, b) reports lanes where
// b < a, and cx_simd_*_gt(a, b) reports lanes where b > a.

// Mask of 32-bit lanes where b < a.
static inline int cx_simd_i32_lt(__m512i a, __m512i b)
{
    // operands must be (b, a): with (a, b) this computed a < b, which is the
    // same mask cx_simd_i32_gt() returns
    return (int)_mm512_cmplt_epi32_mask(b, a);
}

// Mask of 32-bit lanes where b > a.
static inline int cx_simd_i32_gt(__m512i a, __m512i b)
{
    return (int)_mm512_cmpgt_epi32_mask(b, a);
}

// Broadcast `value` into all eight 64-bit lanes.
static inline __m512i cx_simd_i64_set(int64_t value)
{
    return _mm512_set1_epi64(value);
}

// Load eight consecutive int64_t values starting at `ptr` (unaligned load).
static inline __m512i cx_simd_i64_load(const int64_t *ptr)
{
    return _mm512_loadu_si512((const void *)ptr);
}

// Mask of 64-bit lanes where a == b.
static inline int cx_simd_i64_eq(__m512i a, __m512i b)
{
    return (int)_mm512_cmpeq_epi64_mask(a, b);
}

// Mask of 64-bit lanes where b < a.
static inline int cx_simd_i64_lt(__m512i a, __m512i b)
{
    // same operand order fix as cx_simd_i32_lt()
    return (int)_mm512_cmplt_epi64_mask(b, a);
}

// Mask of 64-bit lanes where b > a.
static inline int cx_simd_i64_gt(__m512i a, __m512i b)
{
    return (int)_mm512_cmpgt_epi64_mask(b, a);
}
#if CX_SSE42 13 | #include 14 | #define CX_COLUMN_OVER_ALLOC 16 15 | #endif 16 | 17 | static const size_t cx_column_initial_size = 64; 18 | 19 | struct cx_column { 20 | union { 21 | void *mutable; 22 | const void *mmapped; 23 | } buffer; 24 | size_t count; 25 | size_t offset; 26 | size_t size; 27 | enum cx_column_type type; 28 | enum cx_encoding_type encoding; 29 | bool mmapped; 30 | }; 31 | 32 | struct cx_column_cursor { 33 | const struct cx_column *column; 34 | const void *start; 35 | const void *end; 36 | const void *position; 37 | cx_value_t buffer[CX_BATCH_SIZE]; 38 | }; 39 | 40 | static struct cx_column *cx_column_new_size(enum cx_column_type type, 41 | enum cx_encoding_type encoding, 42 | size_t size, size_t count) 43 | { 44 | struct cx_column *column = calloc(1, sizeof(*column)); 45 | if (!column) 46 | return NULL; 47 | if (size) { 48 | #ifdef CX_COLUMN_OVER_ALLOC 49 | size += CX_COLUMN_OVER_ALLOC; 50 | #endif 51 | column->buffer.mutable = malloc(size); 52 | if (!column->buffer.mutable) 53 | goto error; 54 | #ifdef CX_COLUMN_OVER_ALLOC 55 | memset(column->buffer.mutable, 0, size); 56 | #endif 57 | column->size = size; 58 | } 59 | column->type = type; 60 | column->encoding = encoding; 61 | column->count = count; 62 | return column; 63 | error: 64 | free(column); 65 | return NULL; 66 | } 67 | 68 | struct cx_column *cx_column_new(enum cx_column_type type, 69 | enum cx_encoding_type encoding) 70 | { 71 | return cx_column_new_size(type, encoding, cx_column_initial_size, 0); 72 | } 73 | 74 | struct cx_column *cx_column_new_mmapped(enum cx_column_type type, 75 | enum cx_encoding_type encoding, 76 | const void *ptr, size_t size, 77 | size_t count) 78 | { 79 | struct cx_column *column = cx_column_new_size(type, encoding, 0, count); 80 | if (!column) 81 | return NULL; 82 | column->offset = size; 83 | column->size = size; 84 | column->mmapped = true; 85 | column->buffer.mmapped = ptr; 86 | return column; 87 | } 88 | 89 | struct cx_column *cx_column_new_compressed(enum 
cx_column_type type, 90 | enum cx_encoding_type encoding, 91 | void **ptr, size_t size, 92 | size_t count) 93 | { 94 | if (!size) 95 | return NULL; 96 | struct cx_column *column = cx_column_new_size(type, encoding, size, count); 97 | if (!column) 98 | return NULL; 99 | column->offset = size; 100 | *ptr = column->buffer.mutable; 101 | return column; 102 | } 103 | 104 | void cx_column_free(struct cx_column *column) 105 | { 106 | if (!column->mmapped) 107 | free(column->buffer.mutable); 108 | free(column); 109 | } 110 | 111 | static const void *cx_column_head(const struct cx_column *column) 112 | { 113 | if (column->mmapped) 114 | return column->buffer.mmapped; 115 | else 116 | return column->buffer.mutable; 117 | } 118 | 119 | static const void *cx_column_offset(const struct cx_column *column, 120 | size_t offset) 121 | { 122 | const void *ptr = cx_column_head(column); 123 | return (const void *)((uintptr_t)ptr + offset); 124 | } 125 | 126 | static const void *cx_column_tail(const struct cx_column *column) 127 | { 128 | return cx_column_offset(column, column->offset); 129 | } 130 | 131 | const void *cx_column_export(const struct cx_column *column, size_t *size) 132 | { 133 | *size = column->offset; 134 | return cx_column_head(column); 135 | } 136 | 137 | enum cx_column_type cx_column_type(const struct cx_column *column) 138 | { 139 | return column->type; 140 | } 141 | 142 | enum cx_encoding_type cx_column_encoding(const struct cx_column *column) 143 | { 144 | return column->encoding; 145 | } 146 | 147 | size_t cx_column_count(const struct cx_column *column) 148 | { 149 | return column->count; 150 | } 151 | 152 | __attribute__((noinline)) static bool cx_column_resize(struct cx_column *column, 153 | size_t alloc_size) 154 | { 155 | size_t size = column->size; 156 | size_t required_size = column->offset + alloc_size; 157 | #ifdef CX_COLUMN_OVER_ALLOC 158 | required_size += CX_COLUMN_OVER_ALLOC; 159 | #endif 160 | while (size < required_size) { 161 | assert(size * 2 > 
size); 162 | size *= 2; 163 | } 164 | void *buffer = realloc(column->buffer.mutable, size); 165 | if (!buffer) 166 | return false; 167 | #ifdef CX_COLUMN_OVER_ALLOC 168 | memset((void *)((uintptr_t)buffer + column->offset), 0, 169 | size - column->offset); 170 | #endif 171 | column->buffer.mutable = buffer; 172 | column->size = size; 173 | return true; 174 | } 175 | 176 | static bool cx_column_put(struct cx_column *column, enum cx_column_type type, 177 | const void *value, size_t size) 178 | { 179 | if (column->mmapped || column->type != type || !value) 180 | return false; 181 | if (column->offset + size > column->size) 182 | if (!cx_column_resize(column, size)) 183 | return false; 184 | void *slot = (void *)cx_column_tail(column); 185 | memcpy(slot, value, size); 186 | column->count++; 187 | column->offset += size; 188 | return true; 189 | } 190 | 191 | bool cx_column_put_bit(struct cx_column *column, bool value) 192 | { 193 | if (column->count % 64 == 0) { 194 | uint64_t bitset = value != 0; 195 | return cx_column_put(column, CX_COLUMN_BIT, &bitset, sizeof(uint64_t)); 196 | } else if (column->type != CX_COLUMN_BIT || column->mmapped) 197 | return false; 198 | if (value) { 199 | uint64_t *bitset = (uint64_t *)cx_column_offset( 200 | column, column->offset - sizeof(uint64_t)); 201 | *bitset |= (uint64_t)1 << column->count; 202 | } 203 | column->count++; 204 | return true; 205 | } 206 | 207 | bool cx_column_put_i32(struct cx_column *column, int32_t value) 208 | { 209 | return cx_column_put(column, CX_COLUMN_I32, &value, sizeof(int32_t)); 210 | } 211 | 212 | bool cx_column_put_i64(struct cx_column *column, int64_t value) 213 | { 214 | return cx_column_put(column, CX_COLUMN_I64, &value, sizeof(int64_t)); 215 | } 216 | 217 | bool cx_column_put_flt(struct cx_column *column, float value) 218 | { 219 | return cx_column_put(column, CX_COLUMN_FLT, &value, sizeof(float)); 220 | } 221 | 222 | bool cx_column_put_dbl(struct cx_column *column, double value) 223 | { 224 | return 
cx_column_put(column, CX_COLUMN_DBL, &value, sizeof(double)); 225 | } 226 | 227 | bool cx_column_put_str(struct cx_column *column, const char *value) 228 | { 229 | return cx_column_put(column, CX_COLUMN_STR, value, strlen(value) + 1); 230 | } 231 | 232 | bool cx_column_put_unit(struct cx_column *column) 233 | { 234 | switch (column->type) { 235 | case CX_COLUMN_BIT: 236 | return cx_column_put_bit(column, false); 237 | case CX_COLUMN_I32: 238 | return cx_column_put_i32(column, 0); 239 | case CX_COLUMN_I64: 240 | return cx_column_put_i64(column, 0); 241 | case CX_COLUMN_FLT: 242 | return cx_column_put_flt(column, 0.0); 243 | case CX_COLUMN_DBL: 244 | return cx_column_put_dbl(column, 0.0); 245 | case CX_COLUMN_STR: 246 | return cx_column_put_str(column, ""); 247 | } 248 | return false; 249 | } 250 | 251 | static bool cx_column_madvise(const struct cx_column *column, int advice) 252 | { 253 | if (!column->mmapped || !column->size) 254 | return true; 255 | size_t page_size = getpagesize(); 256 | uintptr_t addr = (uintptr_t)column->buffer.mmapped; 257 | size_t offset = addr % page_size; 258 | return !madvise((void *)(addr - offset), (column->size + offset), advice); 259 | } 260 | 261 | struct cx_column_cursor *cx_column_cursor_new(const struct cx_column *column) 262 | { 263 | struct cx_column_cursor *cursor = malloc(sizeof(*cursor)); 264 | if (!cursor) 265 | return NULL; 266 | cursor->column = column; 267 | cursor->start = cx_column_head(column); 268 | cursor->end = cx_column_tail(column); 269 | if (!cx_column_madvise(column, MADV_SEQUENTIAL)) 270 | goto error; 271 | cx_column_cursor_rewind(cursor); 272 | return cursor; 273 | error: 274 | free(cursor); 275 | return NULL; 276 | } 277 | 278 | void cx_column_cursor_free(struct cx_column_cursor *cursor) 279 | { 280 | free(cursor); 281 | } 282 | 283 | void cx_column_cursor_rewind(struct cx_column_cursor *cursor) 284 | { 285 | cursor->position = cursor->start; 286 | } 287 | 288 | bool cx_column_cursor_valid(const struct 
cx_column_cursor *cursor) 289 | { 290 | return cursor->position < cursor->end; 291 | } 292 | 293 | static void cx_column_cursor_advance(struct cx_column_cursor *cursor, 294 | size_t size) 295 | { 296 | cursor->position = (void *)((uintptr_t)cursor->position + size); 297 | assert(cursor->position <= cursor->end); 298 | } 299 | 300 | static size_t cx_column_cursor_skip(struct cx_column_cursor *cursor, 301 | enum cx_column_type type, size_t size, 302 | size_t count) 303 | { 304 | assert(cursor->column->type == type); 305 | size_t remaining = 306 | ((uintptr_t)cursor->end - (uintptr_t)cursor->position) / size; 307 | if (remaining < count) 308 | count = remaining; 309 | cx_column_cursor_advance(cursor, size * count); 310 | return count; 311 | } 312 | 313 | size_t cx_column_cursor_skip_bit(struct cx_column_cursor *cursor, size_t count) 314 | { 315 | assert(count % 64 == 0); 316 | size_t skipped = cx_column_cursor_skip(cursor, CX_COLUMN_BIT, 317 | sizeof(uint64_t), count / 64); 318 | skipped *= 64; 319 | if (!cx_column_cursor_valid(cursor)) { 320 | size_t trailing_bits = cursor->column->count % 64; 321 | if (trailing_bits) 322 | skipped -= 64 - trailing_bits; 323 | } 324 | return skipped; 325 | } 326 | 327 | size_t cx_column_cursor_skip_i32(struct cx_column_cursor *cursor, size_t count) 328 | { 329 | return cx_column_cursor_skip(cursor, CX_COLUMN_I32, sizeof(int32_t), count); 330 | } 331 | 332 | size_t cx_column_cursor_skip_i64(struct cx_column_cursor *cursor, size_t count) 333 | { 334 | return cx_column_cursor_skip(cursor, CX_COLUMN_I64, sizeof(int64_t), count); 335 | } 336 | 337 | size_t cx_column_cursor_skip_flt(struct cx_column_cursor *cursor, size_t count) 338 | { 339 | return cx_column_cursor_skip(cursor, CX_COLUMN_FLT, sizeof(float), count); 340 | } 341 | 342 | size_t cx_column_cursor_skip_dbl(struct cx_column_cursor *cursor, size_t count) 343 | { 344 | return cx_column_cursor_skip(cursor, CX_COLUMN_DBL, sizeof(double), count); 345 | } 346 | 347 | static inline 
// Length of a NUL-terminated string, excluding the terminator.
//
// With CX_SSE42 enabled the string is examined in 16-byte chunks; each
// iteration adds the index reported by the string-compare instruction and
// the scan ends once that index is below 16. The chunked loads read a full
// 16 bytes starting at the current offset, which relies on the padding the
// column buffers reserve after their values. Without CX_SSE42 this is
// plain strlen().
static inline size_t cx_strlen(const char *string)
{
#if CX_SSE42
    const __m128i zeros = _mm_set1_epi8(0);
    size_t total = 0;
    int index;
    do {
        const __m128i chunk = _mm_loadu_si128((__m128i *)(string + total));
        index = _mm_cmpistri(
            chunk, zeros,
            _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_BIT_MASK);
        total += index;
    } while (index >= 16);
    return total;
#else
    return strlen(string);
#endif
}
*cx_column_cursor_next_batch_dbl(struct cx_column_cursor *cursor, 410 | size_t *available) 411 | { 412 | const double *values = cursor->position; 413 | *available = cx_column_cursor_skip_dbl(cursor, CX_BATCH_SIZE); 414 | return values; 415 | } 416 | 417 | const struct cx_string *cx_column_cursor_next_batch_str( 418 | struct cx_column_cursor *cursor, size_t *available) 419 | { 420 | assert(cursor->column->type == CX_COLUMN_STR); 421 | size_t i = 0; 422 | struct cx_string *strings = (struct cx_string *)cursor->buffer; 423 | for (; i < CX_BATCH_SIZE && cx_column_cursor_valid(cursor); i++) { 424 | strings[i].ptr = cursor->position; 425 | strings[i].len = cx_strlen(cursor->position); 426 | cx_column_cursor_advance(cursor, strings[i].len + 1); 427 | } 428 | *available = i; 429 | return strings; 430 | } 431 | -------------------------------------------------------------------------------- /lib/column.h: -------------------------------------------------------------------------------- 1 | #ifndef CX_COLUMN_H_ 2 | #define CX_COLUMN_H_ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #include "common.h" 9 | 10 | #define CX_BATCH_SIZE 64 11 | 12 | struct cx_column; 13 | 14 | struct cx_column_cursor; 15 | 16 | struct cx_column *cx_column_new(enum cx_column_type, enum cx_encoding_type); 17 | 18 | struct cx_column *cx_column_new_mmapped(enum cx_column_type, 19 | enum cx_encoding_type, const void *ptr, 20 | size_t size, size_t count); 21 | 22 | struct cx_column *cx_column_new_compressed(enum cx_column_type, 23 | enum cx_encoding_type, void **buffer, 24 | size_t size, size_t count); 25 | 26 | void cx_column_free(struct cx_column *); 27 | 28 | const void *cx_column_export(const struct cx_column *, size_t *); 29 | 30 | enum cx_column_type cx_column_type(const struct cx_column *); 31 | 32 | enum cx_encoding_type cx_column_encoding(const struct cx_column *); 33 | 34 | size_t cx_column_count(const struct cx_column *column); 35 | 36 | bool cx_column_put_bit(struct cx_column 
*, bool); 37 | bool cx_column_put_i32(struct cx_column *, int32_t); 38 | bool cx_column_put_i64(struct cx_column *, int64_t); 39 | bool cx_column_put_flt(struct cx_column *, float); 40 | bool cx_column_put_dbl(struct cx_column *, double); 41 | bool cx_column_put_str(struct cx_column *, const char *); 42 | 43 | bool cx_column_put_unit(struct cx_column *); 44 | 45 | struct cx_column_cursor *cx_column_cursor_new(const struct cx_column *); 46 | 47 | void cx_column_cursor_free(struct cx_column_cursor *); 48 | 49 | void cx_column_cursor_rewind(struct cx_column_cursor *); 50 | 51 | bool cx_column_cursor_valid(const struct cx_column_cursor *); 52 | 53 | const uint64_t *cx_column_cursor_next_batch_bit(struct cx_column_cursor *, 54 | size_t *); 55 | const int32_t *cx_column_cursor_next_batch_i32(struct cx_column_cursor *, 56 | size_t *); 57 | const int64_t *cx_column_cursor_next_batch_i64(struct cx_column_cursor *, 58 | size_t *); 59 | const float *cx_column_cursor_next_batch_flt(struct cx_column_cursor *, 60 | size_t *); 61 | const double *cx_column_cursor_next_batch_dbl(struct cx_column_cursor *, 62 | size_t *); 63 | const struct cx_string *cx_column_cursor_next_batch_str( 64 | struct cx_column_cursor *, size_t *); 65 | 66 | size_t cx_column_cursor_skip_bit(struct cx_column_cursor *, size_t); 67 | size_t cx_column_cursor_skip_i32(struct cx_column_cursor *, size_t); 68 | size_t cx_column_cursor_skip_i64(struct cx_column_cursor *, size_t); 69 | size_t cx_column_cursor_skip_flt(struct cx_column_cursor *, size_t); 70 | size_t cx_column_cursor_skip_dbl(struct cx_column_cursor *, size_t); 71 | size_t cx_column_cursor_skip_str(struct cx_column_cursor *, size_t); 72 | 73 | #ifdef __cplusplus 74 | } 75 | #endif 76 | 77 | #endif 78 | -------------------------------------------------------------------------------- /lib/common.h: -------------------------------------------------------------------------------- 1 | #ifndef CX_TYPES_H_ 2 | #define CX_TYPES_H_ 3 | 4 | #ifdef __cplusplus 5 
// Decompress an LZ4 block from `src` into `dest`.
//
// `dest_size` is the exact decompressed size the caller expects. Returns
// true only when decompression succeeds and yields exactly `dest_size`
// bytes; any LZ4 error, a size mismatch, or a size that does not fit the
// int-based LZ4 API yields false.
static bool cx_decompress_lz4(const void *src, size_t src_size, void *dest,
                              size_t dest_size)
{
    // LZ4 takes int sizes; reject anything that would be truncated, the
    // same way cx_compress_lz4() guards its input size
    if (src_size > INT_MAX || dest_size > INT_MAX)
        return false;
    int result = LZ4_decompress_safe(src, dest, (int)src_size, (int)dest_size);
    // negative results are LZ4 error codes
    return result > 0 && (size_t)result == dest_size;
}
static void *cx_compress_lz4hc(int level, const void *src, size_t src_size, 38 | size_t *dest_size) 39 | { 40 | if (src_size > INT_MAX) 41 | return NULL; 42 | int max_size = LZ4_compressBound(src_size); 43 | if (!max_size) 44 | return NULL; 45 | void *compressed = malloc(max_size); 46 | if (!compressed) 47 | return NULL; 48 | int result = LZ4_compress_HC(src, compressed, src_size, max_size, level); 49 | if (!result) 50 | goto error; 51 | *dest_size = result; 52 | return compressed; 53 | error: 54 | free(compressed); 55 | return NULL; 56 | } 57 | 58 | static void *cx_compress_zstd(int level, const void *src, size_t src_size, 59 | size_t *dest_size) 60 | { 61 | int max_size = ZSTD_compressBound(src_size); 62 | if (!max_size) 63 | return NULL; 64 | void *compressed = malloc(max_size); 65 | if (!compressed) 66 | return NULL; 67 | size_t result = ZSTD_compress(compressed, max_size, src, src_size, level); 68 | if (ZSTD_isError(result)) 69 | goto error; 70 | *dest_size = result; 71 | return compressed; 72 | error: 73 | free(compressed); 74 | return NULL; 75 | } 76 | 77 | static bool cx_decompress_zstd(const void *src, size_t src_size, void *dest, 78 | size_t dest_size) 79 | { 80 | size_t result = ZSTD_decompress(dest, dest_size, src, src_size); 81 | return !ZSTD_isError(result) && result == dest_size; 82 | } 83 | 84 | void *cx_compress(enum cx_compression_type type, int level, const void *src, 85 | size_t src_size, size_t *dest_size) 86 | { 87 | if (type == CX_COMPRESSION_LZ4) 88 | return cx_compress_lz4(level, src, src_size, dest_size); 89 | else if (type == CX_COMPRESSION_LZ4HC) 90 | return cx_compress_lz4hc(level, src, src_size, dest_size); 91 | else if (type == CX_COMPRESSION_ZSTD) 92 | return cx_compress_zstd(level, src, src_size, dest_size); 93 | else 94 | return NULL; 95 | } 96 | 97 | bool cx_decompress(enum cx_compression_type type, const void *src, 98 | size_t src_size, void *dest, size_t dest_size) 99 | { 100 | if (type == CX_COMPRESSION_LZ4 || type == 
CX_COMPRESSION_LZ4HC) 101 | return cx_decompress_lz4(src, src_size, dest, dest_size); 102 | else if (type == CX_COMPRESSION_ZSTD) 103 | return cx_decompress_zstd(src, src_size, dest, dest_size); 104 | else 105 | return false; 106 | } 107 | -------------------------------------------------------------------------------- /lib/compress.h: -------------------------------------------------------------------------------- 1 | #ifndef CX_COMPRESS_H_ 2 | #define CX_COMPRESS_H_ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #include "common.h" 9 | 10 | void *cx_compress(enum cx_compression_type, int level, const void *src, 11 | size_t src_size, size_t *dest_size); 12 | 13 | bool cx_decompress(enum cx_compression_type, const void *src, size_t src_size, 14 | void *dest, size_t dest_size); 15 | 16 | #ifdef __cplusplus 17 | } 18 | #endif 19 | 20 | #endif 21 | -------------------------------------------------------------------------------- /lib/file.h: -------------------------------------------------------------------------------- 1 | #ifndef CX_FILE_H_ 2 | #define CX_FILE_H_ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #include "index.h" 9 | 10 | #define CX_FILE_MAGIC 0x7863040378630201LLU 11 | 12 | #define CX_FILE_VERSION 1 13 | 14 | #define CX_WRITE_ALIGN 8 15 | 16 | struct cx_header { 17 | uint64_t magic; 18 | uint32_t version; 19 | uint32_t __padding; 20 | }; 21 | 22 | struct cx_footer { 23 | uint64_t strings_offset; 24 | uint64_t strings_size; 25 | int32_t metadata; 26 | uint32_t __padding; 27 | uint32_t row_group_count; 28 | uint32_t column_count; 29 | uint64_t row_count; 30 | uint32_t size; 31 | uint32_t version; 32 | uint64_t magic; 33 | }; 34 | 35 | struct cx_column_descriptor { 36 | uint32_t name; 37 | uint32_t type; 38 | uint32_t encoding; 39 | uint32_t compression; 40 | int32_t compression_level; 41 | uint32_t __padding; 42 | }; 43 | 44 | struct cx_row_group_header { 45 | uint64_t size; 46 | uint64_t offset; 47 | }; 48 | 49 | struct 
cx_column_header { 50 | uint64_t offset; 51 | uint64_t size; 52 | uint64_t decompressed_size; 53 | uint32_t compression; 54 | uint32_t encoding; 55 | struct cx_index index; 56 | }; 57 | 58 | #ifdef __cplusplus 59 | } 60 | #endif 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /lib/index.c: -------------------------------------------------------------------------------- 1 | #define __STDC_LIMIT_MACROS 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "index.h" 8 | 9 | static void cx_index_update_bit(struct cx_index *, struct cx_column_cursor *); 10 | static void cx_index_update_i32(struct cx_index *, struct cx_column_cursor *); 11 | static void cx_index_update_i64(struct cx_index *, struct cx_column_cursor *); 12 | static void cx_index_update_flt(struct cx_index *, struct cx_column_cursor *); 13 | static void cx_index_update_dbl(struct cx_index *, struct cx_column_cursor *); 14 | static void cx_index_update_str(struct cx_index *, struct cx_column_cursor *); 15 | 16 | struct cx_index *cx_index_new(const struct cx_column *column) 17 | { 18 | struct cx_index *index = calloc(1, sizeof(*index)); 19 | if (!index) 20 | return NULL; 21 | struct cx_column_cursor *cursor = cx_column_cursor_new(column); 22 | if (!cursor) 23 | goto error; 24 | switch (cx_column_type(column)) { 25 | case CX_COLUMN_BIT: 26 | index->min.bit = true; 27 | cx_index_update_bit(index, cursor); 28 | break; 29 | case CX_COLUMN_I32: 30 | index->min.i32 = INT32_MAX; 31 | cx_index_update_i32(index, cursor); 32 | break; 33 | case CX_COLUMN_I64: 34 | index->min.i64 = INT64_MAX; 35 | cx_index_update_i64(index, cursor); 36 | break; 37 | case CX_COLUMN_FLT: 38 | index->min.flt = FLT_MAX; 39 | cx_index_update_flt(index, cursor); 40 | break; 41 | case CX_COLUMN_DBL: 42 | index->min.dbl = DBL_MAX; 43 | cx_index_update_dbl(index, cursor); 44 | break; 45 | case CX_COLUMN_STR: 46 | index->min.len = UINT64_MAX; 47 | cx_index_update_str(index, 
cursor); 48 | break; 49 | } 50 | cx_column_cursor_free(cursor); 51 | return index; 52 | error: 53 | free(index); 54 | return NULL; 55 | } 56 | 57 | void cx_index_free(struct cx_index *index) 58 | { 59 | free(index); 60 | } 61 | 62 | static void cx_index_update_bit(struct cx_index *index, 63 | struct cx_column_cursor *cursor) 64 | { 65 | while (cx_column_cursor_valid(cursor)) { 66 | size_t count; 67 | const uint64_t *bitset = 68 | cx_column_cursor_next_batch_bit(cursor, &count); 69 | assert(count); 70 | index->count += count; 71 | for (size_t i = 0; i < count; i++) { 72 | bool value = *bitset & ((uint64_t)1 << i); 73 | index->min.bit = index->min.bit && value; 74 | index->max.bit = index->max.bit || value; 75 | } 76 | } 77 | } 78 | 79 | static void cx_index_update_i32(struct cx_index *index, 80 | struct cx_column_cursor *cursor) 81 | { 82 | while (cx_column_cursor_valid(cursor)) { 83 | size_t count; 84 | const int32_t *values = cx_column_cursor_next_batch_i32(cursor, &count); 85 | assert(count); 86 | index->count += count; 87 | for (size_t i = 0; i < count; i++) { 88 | int32_t value = values[i]; 89 | if (value > index->max.i32) 90 | index->max.i32 = value; 91 | if (value < index->min.i32) 92 | index->min.i32 = value; 93 | } 94 | } 95 | } 96 | 97 | static void cx_index_update_i64(struct cx_index *index, 98 | struct cx_column_cursor *cursor) 99 | { 100 | while (cx_column_cursor_valid(cursor)) { 101 | size_t count; 102 | const int64_t *values = cx_column_cursor_next_batch_i64(cursor, &count); 103 | assert(count); 104 | index->count += count; 105 | for (size_t i = 0; i < count; i++) { 106 | int64_t value = values[i]; 107 | if (value > index->max.i64) 108 | index->max.i64 = value; 109 | if (value < index->min.i64) 110 | index->min.i64 = value; 111 | } 112 | } 113 | } 114 | 115 | static void cx_index_update_flt(struct cx_index *index, 116 | struct cx_column_cursor *cursor) 117 | { 118 | while (cx_column_cursor_valid(cursor)) { 119 | size_t count; 120 | const float *values 
= cx_column_cursor_next_batch_flt(cursor, &count); 121 | assert(count); 122 | index->count += count; 123 | for (size_t i = 0; i < count; i++) { 124 | float value = values[i]; 125 | if (value > index->max.flt) 126 | index->max.flt = value; 127 | if (value < index->min.flt) 128 | index->min.flt = value; 129 | } 130 | } 131 | } 132 | 133 | static void cx_index_update_dbl(struct cx_index *index, 134 | struct cx_column_cursor *cursor) 135 | { 136 | while (cx_column_cursor_valid(cursor)) { 137 | size_t count; 138 | const double *values = cx_column_cursor_next_batch_dbl(cursor, &count); 139 | assert(count); 140 | index->count += count; 141 | for (size_t i = 0; i < count; i++) { 142 | double value = values[i]; 143 | if (value > index->max.dbl) 144 | index->max.dbl = value; 145 | if (value < index->min.dbl) 146 | index->min.dbl = value; 147 | } 148 | } 149 | } 150 | 151 | static void cx_index_update_str(struct cx_index *index, 152 | struct cx_column_cursor *cursor) 153 | { 154 | while (cx_column_cursor_valid(cursor)) { 155 | size_t count; 156 | const struct cx_string *values = 157 | cx_column_cursor_next_batch_str(cursor, &count); 158 | assert(count); 159 | index->count += count; 160 | for (size_t i = 0; i < count; i++) { 161 | uint64_t value = values[i].len; 162 | if (value > index->max.len) 163 | index->max.len = value; 164 | if (value < index->min.len) 165 | index->min.len = value; 166 | } 167 | } 168 | } 169 | 170 | enum cx_index_match cx_index_match_bit_eq(const struct cx_index *index, 171 | bool value) 172 | { 173 | if (index->min.bit && index->max.bit) 174 | return value ? CX_INDEX_MATCH_ALL : CX_INDEX_MATCH_NONE; 175 | if (!index->min.bit && !index->max.bit) 176 | return value ? 
CX_INDEX_MATCH_NONE : CX_INDEX_MATCH_ALL; 177 | return CX_INDEX_MATCH_UNKNOWN; 178 | } 179 | 180 | enum cx_index_match cx_index_match_i32_eq(const struct cx_index *index, 181 | int32_t value) 182 | { 183 | if (index->min.i32 > value || index->max.i32 < value) 184 | return CX_INDEX_MATCH_NONE; 185 | if (index->min.i32 == value && index->max.i32 == value) 186 | return CX_INDEX_MATCH_ALL; 187 | return CX_INDEX_MATCH_UNKNOWN; 188 | } 189 | 190 | enum cx_index_match cx_index_match_i32_lt(const struct cx_index *index, 191 | int32_t value) 192 | { 193 | if (index->min.i32 >= value) 194 | return CX_INDEX_MATCH_NONE; 195 | if (index->max.i32 < value) 196 | return CX_INDEX_MATCH_ALL; 197 | return CX_INDEX_MATCH_UNKNOWN; 198 | } 199 | 200 | enum cx_index_match cx_index_match_i32_gt(const struct cx_index *index, 201 | int32_t value) 202 | { 203 | if (index->min.i32 > value) 204 | return CX_INDEX_MATCH_ALL; 205 | if (index->max.i32 <= value) 206 | return CX_INDEX_MATCH_NONE; 207 | return CX_INDEX_MATCH_UNKNOWN; 208 | } 209 | 210 | enum cx_index_match cx_index_match_i64_eq(const struct cx_index *index, 211 | int64_t value) 212 | { 213 | if (index->min.i64 > value || index->max.i64 < value) 214 | return CX_INDEX_MATCH_NONE; 215 | if (index->min.i64 == value && index->max.i64 == value) 216 | return CX_INDEX_MATCH_ALL; 217 | return CX_INDEX_MATCH_UNKNOWN; 218 | } 219 | 220 | enum cx_index_match cx_index_match_i64_lt(const struct cx_index *index, 221 | int64_t value) 222 | { 223 | if (index->min.i64 >= value) 224 | return CX_INDEX_MATCH_NONE; 225 | if (index->max.i64 < value) 226 | return CX_INDEX_MATCH_ALL; 227 | return CX_INDEX_MATCH_UNKNOWN; 228 | } 229 | 230 | enum cx_index_match cx_index_match_i64_gt(const struct cx_index *index, 231 | int64_t value) 232 | { 233 | if (index->min.i64 > value) 234 | return CX_INDEX_MATCH_ALL; 235 | if (index->max.i64 <= value) 236 | return CX_INDEX_MATCH_NONE; 237 | return CX_INDEX_MATCH_UNKNOWN; 238 | } 239 | 240 | enum cx_index_match 
cx_index_match_flt_eq(const struct cx_index *index, 241 | float value) 242 | { 243 | if (index->min.flt > value || index->max.flt < value) 244 | return CX_INDEX_MATCH_NONE; 245 | if (index->min.flt == value && index->max.flt == value) 246 | return CX_INDEX_MATCH_ALL; 247 | return CX_INDEX_MATCH_UNKNOWN; 248 | } 249 | 250 | enum cx_index_match cx_index_match_flt_lt(const struct cx_index *index, 251 | float value) 252 | { 253 | if (index->min.flt >= value) 254 | return CX_INDEX_MATCH_NONE; 255 | if (index->max.flt < value) 256 | return CX_INDEX_MATCH_ALL; 257 | return CX_INDEX_MATCH_UNKNOWN; 258 | } 259 | 260 | enum cx_index_match cx_index_match_flt_gt(const struct cx_index *index, 261 | float value) 262 | { 263 | if (index->min.flt > value) 264 | return CX_INDEX_MATCH_ALL; 265 | if (index->max.flt <= value) 266 | return CX_INDEX_MATCH_NONE; 267 | return CX_INDEX_MATCH_UNKNOWN; 268 | } 269 | 270 | enum cx_index_match cx_index_match_dbl_eq(const struct cx_index *index, 271 | double value) 272 | { 273 | if (index->min.dbl > value || index->max.dbl < value) 274 | return CX_INDEX_MATCH_NONE; 275 | if (index->min.dbl == value && index->max.dbl == value) 276 | return CX_INDEX_MATCH_ALL; 277 | return CX_INDEX_MATCH_UNKNOWN; 278 | } 279 | 280 | enum cx_index_match cx_index_match_dbl_lt(const struct cx_index *index, 281 | double value) 282 | { 283 | if (index->min.dbl >= value) 284 | return CX_INDEX_MATCH_NONE; 285 | if (index->max.dbl < value) 286 | return CX_INDEX_MATCH_ALL; 287 | return CX_INDEX_MATCH_UNKNOWN; 288 | } 289 | 290 | enum cx_index_match cx_index_match_dbl_gt(const struct cx_index *index, 291 | double value) 292 | { 293 | if (index->min.dbl > value) 294 | return CX_INDEX_MATCH_ALL; 295 | if (index->max.dbl <= value) 296 | return CX_INDEX_MATCH_NONE; 297 | return CX_INDEX_MATCH_UNKNOWN; 298 | } 299 | 300 | enum cx_index_match cx_index_match_str_eq(const struct cx_index *index, 301 | const struct cx_string *string) 302 | { 303 | if (index->min.len > string->len 
|| index->max.len < string->len) 304 | return CX_INDEX_MATCH_NONE; 305 | return CX_INDEX_MATCH_UNKNOWN; 306 | } 307 | 308 | enum cx_index_match cx_index_match_str_contains(const struct cx_index *index, 309 | const struct cx_string *string) 310 | { 311 | if (index->max.len < string->len) 312 | return CX_INDEX_MATCH_NONE; 313 | return CX_INDEX_MATCH_UNKNOWN; 314 | } 315 | -------------------------------------------------------------------------------- /lib/index.h: -------------------------------------------------------------------------------- 1 | #ifndef CX_INDEX_H_ 2 | #define CX_INDEX_H_ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #include "column.h" 9 | 10 | typedef union { 11 | bool bit; 12 | int32_t i32; 13 | int64_t i64; 14 | float flt; 15 | double dbl; 16 | uint64_t len; 17 | } cx_index_value_t; 18 | 19 | struct cx_index { 20 | uint64_t count; 21 | cx_index_value_t min; 22 | cx_index_value_t max; 23 | }; 24 | 25 | struct cx_index *cx_index_new(const struct cx_column *); 26 | 27 | void cx_index_free(struct cx_index *); 28 | 29 | enum cx_index_match { 30 | CX_INDEX_MATCH_NONE = -1, 31 | CX_INDEX_MATCH_UNKNOWN = 0, 32 | CX_INDEX_MATCH_ALL = 1 33 | }; 34 | 35 | enum cx_index_match cx_index_match_bit_eq(const struct cx_index *, bool); 36 | 37 | enum cx_index_match cx_index_match_i32_eq(const struct cx_index *, int32_t); 38 | enum cx_index_match cx_index_match_i32_lt(const struct cx_index *, int32_t); 39 | enum cx_index_match cx_index_match_i32_gt(const struct cx_index *, int32_t); 40 | 41 | enum cx_index_match cx_index_match_i64_eq(const struct cx_index *, int64_t); 42 | enum cx_index_match cx_index_match_i64_lt(const struct cx_index *, int64_t); 43 | enum cx_index_match cx_index_match_i64_gt(const struct cx_index *, int64_t); 44 | 45 | enum cx_index_match cx_index_match_flt_eq(const struct cx_index *, float); 46 | enum cx_index_match cx_index_match_flt_lt(const struct cx_index *, float); 47 | enum cx_index_match cx_index_match_flt_gt(const 
struct cx_index *, float); 48 | 49 | enum cx_index_match cx_index_match_dbl_eq(const struct cx_index *, double); 50 | enum cx_index_match cx_index_match_dbl_lt(const struct cx_index *, double); 51 | enum cx_index_match cx_index_match_dbl_gt(const struct cx_index *, double); 52 | 53 | enum cx_index_match cx_index_match_str_eq(const struct cx_index *, 54 | const struct cx_string *); 55 | enum cx_index_match cx_index_match_str_contains(const struct cx_index *, 56 | const struct cx_string *); 57 | 58 | #ifdef __cplusplus 59 | } 60 | #endif 61 | 62 | #endif 63 | -------------------------------------------------------------------------------- /lib/java.h: -------------------------------------------------------------------------------- 1 | #ifndef CX_JAVA_H_ 2 | #define CX_JAVA_H_ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #include 9 | 10 | CX_EXPORT jlong Java_com_columnix_jni_Reader_create(JNIEnv *, jobject, jstring); 11 | CX_EXPORT jlong Java_com_columnix_jni_Reader_createMatching(JNIEnv *, jobject, 12 | jstring, jlong); 13 | CX_EXPORT void Java_com_columnix_jni_Reader_free(JNIEnv *, jobject, jlong); 14 | CX_EXPORT jstring Java_com_columnix_jni_Reader_getMetadata(JNIEnv *, jobject, 15 | jlong); 16 | CX_EXPORT jint Java_com_columnix_jni_Reader_columnCount(JNIEnv *, jobject, 17 | jlong); 18 | CX_EXPORT jlong Java_com_columnix_jni_Reader_rowCount(JNIEnv *, jobject, jlong); 19 | CX_EXPORT void Java_com_columnix_jni_Reader_rewind(JNIEnv *, jobject, jlong); 20 | CX_EXPORT jboolean Java_com_columnix_jni_Reader_next(JNIEnv *, jobject, jlong); 21 | CX_EXPORT jstring Java_com_columnix_jni_Reader_columnName(JNIEnv *, jobject, 22 | jlong, jint); 23 | CX_EXPORT jint Java_com_columnix_jni_Reader_columnType(JNIEnv *, jobject, jlong, 24 | jint); 25 | CX_EXPORT jint Java_com_columnix_jni_Reader_columnEncoding(JNIEnv *, jobject, 26 | jlong, jint); 27 | CX_EXPORT jint Java_com_columnix_jni_Reader_columnCompression(JNIEnv *, jobject, 28 | jlong, jint); 29 | CX_EXPORT 
jboolean Java_com_columnix_jni_Reader_isNull(JNIEnv *, jobject, jlong, 30 | jint); 31 | CX_EXPORT jboolean Java_com_columnix_jni_Reader_getBoolean(JNIEnv *, jobject, 32 | jlong, jint); 33 | CX_EXPORT jint Java_com_columnix_jni_Reader_getInt(JNIEnv *, jobject, jlong, 34 | jint); 35 | CX_EXPORT jlong Java_com_columnix_jni_Reader_getLong(JNIEnv *, jobject, jlong, 36 | jint); 37 | CX_EXPORT jfloat Java_com_columnix_jni_Reader_getFloat(JNIEnv *, jobject, jlong, 38 | jint); 39 | CX_EXPORT jdouble Java_com_columnix_jni_Reader_getDouble(JNIEnv *, jobject, 40 | jlong, jint); 41 | CX_EXPORT jstring Java_com_columnix_jni_Reader_getString(JNIEnv *, jobject, 42 | jlong, jint); 43 | CX_EXPORT jbyteArray Java_com_columnix_jni_Reader_getStringBytes(JNIEnv *, 44 | jobject, jlong, 45 | jint); 46 | 47 | CX_EXPORT jlong Java_com_columnix_jni_Writer_create(JNIEnv *, jobject, jstring, 48 | jlong); 49 | CX_EXPORT void Java_com_columnix_jni_Writer_free(JNIEnv *, jobject, jlong); 50 | CX_EXPORT void Java_com_columnix_jni_Writer_setMetadata(JNIEnv *, jobject, 51 | jlong, jstring); 52 | CX_EXPORT void Java_com_columnix_jni_Writer_finish(JNIEnv *, jobject, jlong, 53 | jboolean); 54 | CX_EXPORT void Java_com_columnix_jni_Writer_addColumn(JNIEnv *, jobject, jlong, 55 | jstring, jint, jint, jint, 56 | jint); 57 | CX_EXPORT void Java_com_columnix_jni_Writer_putNull(JNIEnv *, jobject, jlong, 58 | jint); 59 | CX_EXPORT void Java_com_columnix_jni_Writer_putBoolean(JNIEnv *, jobject, jlong, 60 | jint, jboolean); 61 | CX_EXPORT void Java_com_columnix_jni_Writer_putInt(JNIEnv *, jobject, jlong, 62 | jint, jint); 63 | CX_EXPORT void Java_com_columnix_jni_Writer_putLong(JNIEnv *, jobject, jlong, 64 | jint, jlong); 65 | CX_EXPORT void Java_com_columnix_jni_Writer_putFloat(JNIEnv *, jobject, jlong, 66 | jint, jfloat); 67 | CX_EXPORT void Java_com_columnix_jni_Writer_putDouble(JNIEnv *, jobject, jlong, 68 | jint, jdouble); 69 | CX_EXPORT void Java_com_columnix_jni_Writer_putString(JNIEnv *, jobject, jlong, 
70 | jint, jstring); 71 | 72 | CX_EXPORT jlong Java_com_columnix_jni_Predicate_negate(JNIEnv *, jobject, 73 | jlong); 74 | CX_EXPORT void Java_com_columnix_jni_Predicate_free(JNIEnv *, jobject, jlong); 75 | CX_EXPORT jlong Java_com_columnix_jni_Predicate_and(JNIEnv *, jobject, 76 | jlongArray); 77 | CX_EXPORT jlong Java_com_columnix_jni_Predicate_or(JNIEnv *, jobject, 78 | jlongArray); 79 | CX_EXPORT jlong Java_com_columnix_jni_Predicate_isNull(JNIEnv *, jobject, jint); 80 | 81 | CX_EXPORT jlong Java_com_columnix_jni_Predicate_booleanEquals(JNIEnv *, jobject, 82 | jint, jboolean); 83 | 84 | CX_EXPORT jlong Java_com_columnix_jni_Predicate_intEquals(JNIEnv *, jobject, 85 | jint, jint); 86 | CX_EXPORT jlong Java_com_columnix_jni_Predicate_intGreaterThan(JNIEnv *, 87 | jobject, jint, 88 | jint); 89 | CX_EXPORT jlong Java_com_columnix_jni_Predicate_intLessThan(JNIEnv *, jobject, 90 | jint, jint); 91 | 92 | CX_EXPORT jlong Java_com_columnix_jni_Predicate_longEquals(JNIEnv *, jobject, 93 | jint, jlong); 94 | CX_EXPORT jlong Java_com_columnix_jni_Predicate_longGreaterThan(JNIEnv *, 95 | jobject, jint, 96 | jlong); 97 | CX_EXPORT jlong Java_com_columnix_jni_Predicate_longLessThan(JNIEnv *, jobject, 98 | jint, jlong); 99 | /* NOTE(review): the float/double predicate constructors below are declared as returning jfloat/jdouble, whereas every other Predicate_* constructor in this header returns a jlong - presumably these should return jlong as well; confirm against the definitions in java.c before changing. */ 100 | CX_EXPORT jfloat Java_com_columnix_jni_Predicate_floatEquals(JNIEnv *, jobject, 101 | jint, jfloat); 102 | CX_EXPORT jfloat Java_com_columnix_jni_Predicate_floatGreaterThan(JNIEnv *, 103 | jobject, jint, 104 | jfloat); 105 | CX_EXPORT jfloat Java_com_columnix_jni_Predicate_floatLessThan(JNIEnv *, 106 | jobject, jint, 107 | jfloat); 108 | 109 | CX_EXPORT jdouble Java_com_columnix_jni_Predicate_doubleEquals(JNIEnv *, 110 | jobject, jint, 111 | jdouble); 112 | CX_EXPORT jdouble Java_com_columnix_jni_Predicate_doubleGreaterThan(JNIEnv *, 113 | jobject, 114 | jint, 115 | jdouble); 116 | CX_EXPORT jdouble Java_com_columnix_jni_Predicate_doubleLessThan(JNIEnv *, 117 | jobject, jint, 118 | jdouble); 119 | 120 | CX_EXPORT jlong 
Java_com_columnix_jni_Predicate_stringEquals(JNIEnv *, jobject, 121 | jint, jstring, 122 | jboolean); 123 | CX_EXPORT jlong Java_com_columnix_jni_Predicate_stringGreaterThan(JNIEnv *, 124 | jobject, jint, 125 | jstring, 126 | jboolean); 127 | CX_EXPORT jlong Java_com_columnix_jni_Predicate_stringLessThan(JNIEnv *, 128 | jobject, jint, 129 | jstring, 130 | jboolean); 131 | CX_EXPORT jlong Java_com_columnix_jni_Predicate_stringContains(JNIEnv *, 132 | jobject, jint, 133 | jstring, jint, 134 | jboolean); 135 | 136 | #ifdef __cplusplus 137 | } 138 | #endif 139 | 140 | #endif 141 | -------------------------------------------------------------------------------- /lib/libcolumnix.pc.in: -------------------------------------------------------------------------------- 1 | prefix=@PREFIX@ 2 | libdir=@LIBDIR@ 3 | includedir=@INCLUDEDIR@ 4 | 5 | Name: columnix 6 | Description: columnar storage 7 | URL: http://www.columnix.com/ 8 | Version: @VERSION@ 9 | Libs: -L${libdir} -lcolumnix 10 | Cflags: -I${includedir} 11 | -------------------------------------------------------------------------------- /lib/match.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | #include 3 | #include 4 | 5 | #include "match.h" 6 | 7 | #ifdef CX_AVX512 8 | #include "avx512.h" 9 | #define CX_SIMD_WIDTH 64 10 | #elif defined(CX_AVX2) 11 | #include "avx2.h" 12 | #define CX_SIMD_WIDTH 32 13 | #elif defined(CX_AVX) 14 | #include "avx.h" 15 | #define CX_SIMD_WIDTH 16 16 | #endif 17 | 18 | #ifdef CX_SSE42 19 | #include 20 | #endif 21 | 22 | #define CX_NAIVE_MATCH_DEFINITION(name, type, match, op) \ 23 | static uint64_t cx_match_##name##_##match##_naive( \ 24 | size_t size, const type batch[], type cmp) \ 25 | { \ 26 | assert(size <= 64); \ 27 | uint64_t mask = 0; \ 28 | for (size_t i = 0; i < size; i++) \ 29 | if (batch[i] op cmp) \ 30 | mask |= (uint64_t)1 << i; \ 31 | return mask; \ 32 | } 33 | 34 | #ifdef CX_SIMD_WIDTH 35 | 36 | #define 
CX_SIMD_MATCH_DEFINITION(width, name, type, match) \ 37 | static inline uint64_t cx_match_##name##_##match##_simd( \ 38 | size_t size, const type batch[], type cmp) \ 39 | { \ 40 | cx_##name##_vec_t v_cmp = cx_simd_##name##_set(cmp); \ 41 | int partial_mask[64 * sizeof(type) / width]; \ 42 | for (size_t i = 0; i < 64 * sizeof(type) / width; i++) { \ 43 | cx_##name##_vec_t chunk = \ 44 | cx_simd_##name##_load(&batch[i * (width / sizeof(type))]); \ 45 | partial_mask[i] = cx_simd_##name##_##match(v_cmp, chunk); \ 46 | } \ 47 | uint64_t mask = 0; \ 48 | for (size_t i = 0; i < 64 * sizeof(type) / width; i++) \ 49 | mask |= \ 50 | ((uint64_t)partial_mask[i] << (i * (width / sizeof(type)))); \ 51 | return mask; \ 52 | } 53 | 54 | #define CX_SIMD_MATCH_SET(name, type) \ 55 | CX_SIMD_MATCH_DEFINITION(CX_SIMD_WIDTH, name, type, eq) \ 56 | CX_SIMD_MATCH_DEFINITION(CX_SIMD_WIDTH, name, type, lt) \ 57 | CX_SIMD_MATCH_DEFINITION(CX_SIMD_WIDTH, name, type, gt) 58 | 59 | CX_SIMD_MATCH_SET(i32, int32_t) 60 | CX_SIMD_MATCH_SET(i64, int64_t) 61 | CX_SIMD_MATCH_SET(flt, float) 62 | CX_SIMD_MATCH_SET(dbl, double) 63 | 64 | #define CX_MATCH_DEFINITION(name, type, match) \ 65 | uint64_t cx_match_##name##_##match(size_t size, const type batch[], \ 66 | type cmp) \ 67 | { \ 68 | if (size == 64) \ 69 | return cx_match_##name##_##match##_simd(size, batch, cmp); \ 70 | return cx_match_##name##_##match##_naive(size, batch, cmp); \ 71 | } 72 | 73 | #else 74 | 75 | #define CX_MATCH_DEFINITION(name, type, match) \ 76 | uint64_t cx_match_##name##_##match(size_t size, const type batch[], \ 77 | type cmp) \ 78 | { \ 79 | return cx_match_##name##_##match##_naive(size, batch, cmp); \ 80 | } 81 | 82 | #endif // simd 83 | 84 | #define CX_MATCH_TYPE(name, type) \ 85 | CX_NAIVE_MATCH_DEFINITION(name, type, eq, ==) \ 86 | CX_MATCH_DEFINITION(name, type, eq) \ 87 | CX_NAIVE_MATCH_DEFINITION(name, type, lt, <) \ 88 | CX_MATCH_DEFINITION(name, type, lt) \ 89 | CX_NAIVE_MATCH_DEFINITION(name, type, gt, >) \ 90 
| CX_MATCH_DEFINITION(name, type, gt) 91 | 92 | CX_MATCH_TYPE(i32, int32_t) 93 | CX_MATCH_TYPE(i64, int64_t) 94 | CX_MATCH_TYPE(flt, float) 95 | CX_MATCH_TYPE(dbl, double) 96 | 97 | static inline bool cx_str_eq(const struct cx_string *str, 98 | const struct cx_string *cmp) 99 | { 100 | if (str->len != cmp->len) 101 | return false; 102 | #if CX_SSE42 103 | if (str->len < 16) { 104 | __m128i v_str = _mm_loadu_si128((__m128i *)str->ptr); 105 | __m128i v_cmp = _mm_loadu_si128((__m128i *)cmp->ptr); 106 | return _mm_cmpistro( 107 | v_cmp, v_str, 108 | _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_BIT_MASK); 109 | } 110 | #endif 111 | return !memcmp(str->ptr, cmp->ptr, str->len); 112 | } 113 | 114 | static inline bool cx_str_eq_ci(const struct cx_string *str, 115 | const struct cx_string *cmp) 116 | { 117 | return str->len == cmp->len && !strcasecmp(str->ptr, cmp->ptr); 118 | } 119 | 120 | static inline bool cx_str_contains_any(const struct cx_string *str, 121 | const struct cx_string *cmp) 122 | { 123 | if (str->len < cmp->len) 124 | return false; 125 | #if CX_SSE42 126 | if (str->len < 16 && cmp->len < 16) { 127 | __m128i v_str = _mm_loadu_si128((__m128i *)str->ptr); 128 | __m128i v_cmp = _mm_loadu_si128((__m128i *)cmp->ptr); 129 | return _mm_cmpistrc( 130 | v_cmp, v_str, 131 | _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED | _SIDD_BIT_MASK); 132 | } 133 | #endif 134 | return !!strstr(str->ptr, cmp->ptr); 135 | } 136 | 137 | static inline bool cx_str_contains_any_ci(const struct cx_string *str, 138 | const struct cx_string *cmp) 139 | { 140 | return str->len >= cmp->len && !!strcasestr(str->ptr, cmp->ptr); 141 | } 142 | 143 | static inline bool cx_str_contains_start(const struct cx_string *str, 144 | const struct cx_string *cmp) 145 | { 146 | if (str->len < cmp->len) 147 | return false; 148 | #if CX_SSE42 149 | if (cmp->len < 16) { 150 | __m128i v_str = _mm_loadu_si128((__m128i *)str->ptr); 151 | __m128i v_cmp = _mm_loadu_si128((__m128i *)cmp->ptr); 152 | return 
_mm_cmpistro( 153 | v_cmp, v_str, 154 | _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED | _SIDD_BIT_MASK); 155 | } 156 | #endif 157 | return !memcmp(str->ptr, cmp->ptr, cmp->len); 158 | } 159 | 160 | static inline bool cx_str_contains_start_ci(const struct cx_string *str, 161 | const struct cx_string *cmp) 162 | { 163 | return str->len >= cmp->len && !strncasecmp(str->ptr, cmp->ptr, cmp->len); 164 | } 165 | 166 | static inline bool cx_str_contains_end(const struct cx_string *str, 167 | const struct cx_string *cmp) 168 | { 169 | if (str->len < cmp->len) 170 | return false; 171 | #if CX_SSE42 172 | if (cmp->len < 16) { 173 | __m128i v_str = 174 | _mm_loadu_si128((__m128i *)(str->ptr + str->len - cmp->len)); 175 | __m128i v_cmp = _mm_loadu_si128((__m128i *)cmp->ptr); 176 | return _mm_cmpistro( 177 | v_cmp, v_str, 178 | _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED | _SIDD_BIT_MASK); 179 | } 180 | #endif 181 | return !memcmp(str->ptr + str->len - cmp->len, cmp->ptr, cmp->len); 182 | } 183 | 184 | static inline bool cx_str_contains_end_ci(const struct cx_string *str, 185 | const struct cx_string *cmp) 186 | { 187 | return str->len >= cmp->len && 188 | !strncasecmp(str->ptr + str->len - cmp->len, cmp->ptr, cmp->len); 189 | } 190 | 191 | static inline bool cx_str_lt(const struct cx_string *str, 192 | const struct cx_string *cmp) 193 | { 194 | return strcmp(str->ptr, cmp->ptr) < 0; 195 | } 196 | 197 | static inline bool cx_str_gt(const struct cx_string *str, 198 | const struct cx_string *cmp) 199 | { 200 | return strcmp(str->ptr, cmp->ptr) > 0; 201 | } 202 | 203 | static inline bool cx_str_lt_ci(const struct cx_string *str, 204 | const struct cx_string *cmp) 205 | { 206 | return strcasecmp(str->ptr, cmp->ptr) < 0; 207 | } 208 | 209 | static inline bool cx_str_gt_ci(const struct cx_string *str, 210 | const struct cx_string *cmp) 211 | { 212 | return strcasecmp(str->ptr, cmp->ptr) > 0; 213 | } 214 | 215 | #define CX_STR_MATCH(name, prefix) \ 216 | prefix uint64_t cx_match_str_##name( 
\ 217 | size_t size, const struct cx_string strings[], \ 218 | const struct cx_string *cmp, bool case_sensitive) \ 219 | { \ 220 | assert(size <= 64); \ 221 | uint64_t mask = 0; \ 222 | if (case_sensitive) { \ 223 | for (size_t i = 0; i < size; i++) \ 224 | if (cx_str_##name(&strings[i], cmp)) \ 225 | mask |= (uint64_t)1 << i; \ 226 | } else { \ 227 | for (size_t i = 0; i < size; i++) \ 228 | if (cx_str_##name##_ci(&strings[i], cmp)) \ 229 | mask |= (uint64_t)1 << i; \ 230 | } \ 231 | return mask; \ 232 | } 233 | 234 | CX_STR_MATCH(eq, ) 235 | CX_STR_MATCH(lt, ) 236 | CX_STR_MATCH(gt, ) 237 | 238 | CX_STR_MATCH(contains_any, static) 239 | CX_STR_MATCH(contains_start, static) 240 | CX_STR_MATCH(contains_end, static) 241 | 242 | uint64_t cx_match_str_contains(size_t size, const struct cx_string strings[], 243 | const struct cx_string *cmp, bool case_sensitive, 244 | enum cx_str_location location) 245 | { 246 | uint64_t matches = 0; 247 | switch (location) { 248 | case CX_STR_LOCATION_START: 249 | matches = 250 | cx_match_str_contains_start(size, strings, cmp, case_sensitive); 251 | break; 252 | case CX_STR_LOCATION_END: 253 | matches = 254 | cx_match_str_contains_end(size, strings, cmp, case_sensitive); 255 | break; 256 | case CX_STR_LOCATION_ANY: 257 | matches = 258 | cx_match_str_contains_any(size, strings, cmp, case_sensitive); 259 | break; 260 | } 261 | return matches; 262 | } 263 | -------------------------------------------------------------------------------- /lib/match.h: -------------------------------------------------------------------------------- 1 | #ifndef CX_MATCH_H_ 2 | #define CX_MATCH_H_ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #include "column.h" 9 | 10 | uint64_t cx_match_i32_eq(size_t, const int32_t[], int32_t); 11 | uint64_t cx_match_i32_lt(size_t, const int32_t[], int32_t); 12 | uint64_t cx_match_i32_gt(size_t, const int32_t[], int32_t); 13 | 14 | uint64_t cx_match_i64_eq(size_t, const int64_t[], int64_t); 15 | uint64_t 
cx_match_i64_lt(size_t, const int64_t[], int64_t); 16 | uint64_t cx_match_i64_gt(size_t, const int64_t[], int64_t); 17 | 18 | uint64_t cx_match_flt_eq(size_t, const float[], float); 19 | uint64_t cx_match_flt_lt(size_t, const float[], float); 20 | uint64_t cx_match_flt_gt(size_t, const float[], float); 21 | 22 | uint64_t cx_match_dbl_eq(size_t, const double[], double); 23 | uint64_t cx_match_dbl_lt(size_t, const double[], double); 24 | uint64_t cx_match_dbl_gt(size_t, const double[], double); 25 | 26 | uint64_t cx_match_str_eq(size_t, const struct cx_string[], 27 | const struct cx_string *, bool); 28 | uint64_t cx_match_str_lt(size_t, const struct cx_string[], 29 | const struct cx_string *, bool); 30 | uint64_t cx_match_str_gt(size_t, const struct cx_string[], 31 | const struct cx_string *, bool); 32 | uint64_t cx_match_str_contains(size_t, const struct cx_string[], 33 | const struct cx_string *, bool, 34 | enum cx_str_location); 35 | 36 | #ifdef __cplusplus 37 | } 38 | #endif 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /lib/predicate.h: -------------------------------------------------------------------------------- 1 | #ifndef CX_PREDICATE_H_ 2 | #define CX_PREDICATE_H_ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #include 9 | 10 | #include "row_group.h" 11 | 12 | struct cx_predicate; 13 | 14 | CX_EXPORT void cx_predicate_free(struct cx_predicate *); 15 | 16 | CX_EXPORT struct cx_predicate *cx_predicate_new_true(void); 17 | 18 | CX_EXPORT struct cx_predicate *cx_predicate_new_null(size_t); 19 | 20 | CX_EXPORT struct cx_predicate *cx_predicate_negate(struct cx_predicate *); 21 | 22 | CX_EXPORT struct cx_predicate *cx_predicate_new_bit_eq(size_t, bool); 23 | 24 | CX_EXPORT struct cx_predicate *cx_predicate_new_i32_eq(size_t, int32_t); 25 | CX_EXPORT struct cx_predicate *cx_predicate_new_i32_lt(size_t, int32_t); 26 | CX_EXPORT struct cx_predicate *cx_predicate_new_i32_gt(size_t, int32_t); 
27 | 28 | CX_EXPORT struct cx_predicate *cx_predicate_new_i64_eq(size_t, int64_t); 29 | CX_EXPORT struct cx_predicate *cx_predicate_new_i64_lt(size_t, int64_t); 30 | CX_EXPORT struct cx_predicate *cx_predicate_new_i64_gt(size_t, int64_t); 31 | 32 | CX_EXPORT struct cx_predicate *cx_predicate_new_flt_eq(size_t, float); 33 | CX_EXPORT struct cx_predicate *cx_predicate_new_flt_lt(size_t, float); 34 | CX_EXPORT struct cx_predicate *cx_predicate_new_flt_gt(size_t, float); 35 | 36 | CX_EXPORT struct cx_predicate *cx_predicate_new_dbl_eq(size_t, double); 37 | CX_EXPORT struct cx_predicate *cx_predicate_new_dbl_lt(size_t, double); 38 | CX_EXPORT struct cx_predicate *cx_predicate_new_dbl_gt(size_t, double); 39 | 40 | CX_EXPORT struct cx_predicate *cx_predicate_new_str_eq(size_t, const char *, 41 | bool); 42 | CX_EXPORT struct cx_predicate *cx_predicate_new_str_lt(size_t, const char *, 43 | bool); 44 | CX_EXPORT struct cx_predicate *cx_predicate_new_str_gt(size_t, const char *, 45 | bool); 46 | CX_EXPORT struct cx_predicate *cx_predicate_new_str_contains( 47 | size_t, const char *, bool, enum cx_str_location); 48 | 49 | CX_EXPORT struct cx_predicate *cx_predicate_new_and(size_t, ...); 50 | CX_EXPORT struct cx_predicate *cx_predicate_new_vand(size_t, va_list); 51 | CX_EXPORT struct cx_predicate *cx_predicate_new_aand(size_t, 52 | struct cx_predicate **); 53 | CX_EXPORT struct cx_predicate *cx_predicate_new_or(size_t, ...); 54 | CX_EXPORT struct cx_predicate *cx_predicate_new_vor(size_t, va_list); 55 | CX_EXPORT struct cx_predicate *cx_predicate_new_aor(size_t, 56 | struct cx_predicate **); 57 | 58 | bool cx_predicate_valid(const struct cx_predicate *, 59 | const struct cx_row_group *); 60 | 61 | void cx_predicate_optimize(struct cx_predicate *, const struct cx_row_group *); 62 | 63 | const struct cx_predicate **cx_predicate_operands(const struct cx_predicate *, 64 | size_t *); 65 | 66 | enum cx_index_match cx_index_match_indexes(const struct cx_predicate *, 67 | const struct 
cx_row_group *); 68 | 69 | bool cx_index_match_rows(const struct cx_predicate *predicate, 70 | const struct cx_row_group *row_group, 71 | struct cx_row_group_cursor *cursor, uint64_t *matches, 72 | size_t *count); 73 | 74 | typedef enum cx_index_match (*cx_index_match_index_t)(enum cx_column_type, 75 | const struct cx_index *, 76 | void *data); 77 | 78 | typedef bool (*cx_index_match_rows_t)(enum cx_column_type, size_t count, 79 | const void *values, uint64_t *matches, 80 | void *data); 81 | 82 | struct cx_predicate *cx_predicate_new_custom(size_t column, enum cx_column_type, 83 | cx_index_match_rows_t, 84 | cx_index_match_index_t, int cost, 85 | void *data); 86 | 87 | #ifdef __cplusplus 88 | } 89 | #endif 90 | 91 | #endif 92 | -------------------------------------------------------------------------------- /lib/reader.h: -------------------------------------------------------------------------------- 1 | #ifndef CX_READER_H_ 2 | #define CX_READER_H_ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #include 9 | 10 | #include "row.h" 11 | 12 | struct cx_reader; 13 | 14 | CX_EXPORT struct cx_reader *cx_reader_new(const char *); 15 | 16 | CX_EXPORT struct cx_reader *cx_reader_new_matching(const char *, 17 | struct cx_predicate *); 18 | 19 | CX_EXPORT bool cx_reader_metadata(const struct cx_reader *, const char **); 20 | 21 | CX_EXPORT void cx_reader_free(struct cx_reader *); 22 | 23 | CX_EXPORT void cx_reader_rewind(struct cx_reader *); 24 | 25 | CX_EXPORT bool cx_reader_next(struct cx_reader *); 26 | 27 | CX_EXPORT bool cx_reader_error(const struct cx_reader *); 28 | 29 | CX_EXPORT size_t cx_reader_column_count(const struct cx_reader *); 30 | 31 | CX_EXPORT size_t cx_reader_row_count(struct cx_reader *); 32 | 33 | CX_EXPORT bool cx_reader_query(struct cx_reader *, int thread_count, void *data, 34 | void (*iter)(struct cx_row_cursor *, 35 | pthread_mutex_t *, void *)); 36 | 37 | CX_EXPORT const char *cx_reader_column_name(const struct cx_reader *, 
size_t); 38 | 39 | CX_EXPORT enum cx_column_type cx_reader_column_type(const struct cx_reader *, 40 | size_t); 41 | 42 | CX_EXPORT enum cx_encoding_type cx_reader_column_encoding( 43 | const struct cx_reader *, size_t); 44 | 45 | CX_EXPORT enum cx_compression_type cx_reader_column_compression( 46 | const struct cx_reader *, size_t, int *level); 47 | 48 | CX_EXPORT bool cx_reader_get_null(const struct cx_reader *, size_t column_index, 49 | bool *value); 50 | CX_EXPORT bool cx_reader_get_bit(const struct cx_reader *, size_t column_index, 51 | bool *value); 52 | CX_EXPORT bool cx_reader_get_i32(const struct cx_reader *, size_t column_index, 53 | int32_t *value); 54 | CX_EXPORT bool cx_reader_get_i64(const struct cx_reader *, size_t column_index, 55 | int64_t *value); 56 | CX_EXPORT bool cx_reader_get_flt(const struct cx_reader *, size_t column_index, 57 | float *value); 58 | CX_EXPORT bool cx_reader_get_dbl(const struct cx_reader *, size_t column_index, 59 | double *value); 60 | CX_EXPORT bool cx_reader_get_str(const struct cx_reader *, size_t column_index, 61 | struct cx_string *value); 62 | 63 | struct cx_row_group_reader; 64 | 65 | struct cx_row_group_reader *cx_row_group_reader_new(const char *); 66 | 67 | bool cx_row_group_reader_metadata(const struct cx_row_group_reader *, 68 | const char **); 69 | 70 | size_t cx_row_group_reader_column_count(const struct cx_row_group_reader *); 71 | 72 | size_t cx_row_group_reader_row_count(const struct cx_row_group_reader *); 73 | 74 | size_t cx_row_group_reader_row_group_count(const struct cx_row_group_reader *); 75 | 76 | const char *cx_row_group_reader_column_name(const struct cx_row_group_reader *, 77 | size_t); 78 | 79 | enum cx_column_type cx_row_group_reader_column_type( 80 | const struct cx_row_group_reader *, size_t); 81 | 82 | enum cx_encoding_type cx_row_group_reader_column_encoding( 83 | const struct cx_row_group_reader *, size_t); 84 | 85 | enum cx_compression_type cx_row_group_reader_column_compression( 86 | 
const struct cx_row_group_reader *, size_t, int *level); 87 | 88 | struct cx_row_group *cx_row_group_reader_get(const struct cx_row_group_reader *, 89 | size_t); 90 | 91 | void cx_row_group_reader_free(struct cx_row_group_reader *reader); 92 | 93 | #ifdef __cplusplus 94 | } 95 | #endif 96 | 97 | #endif 98 | -------------------------------------------------------------------------------- /lib/row.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "row.h" 5 | 6 | struct cx_row_cursor { 7 | struct cx_row_group *row_group; 8 | struct cx_row_group_cursor *cursor; 9 | const struct cx_predicate *predicate; 10 | size_t column_count; 11 | uint64_t row_mask; 12 | size_t position; 13 | enum cx_index_match index_match; 14 | bool implicit_predicate; 15 | bool error; 16 | }; 17 | 18 | struct cx_row_cursor *cx_row_cursor_new(struct cx_row_group *row_group, 19 | const struct cx_predicate *predicate) 20 | { 21 | struct cx_row_cursor *cursor = calloc(1, sizeof(*cursor)); 22 | if (!cursor) 23 | return NULL; 24 | cursor->row_group = row_group; 25 | cursor->cursor = cx_row_group_cursor_new(row_group); 26 | if (!cursor->cursor) 27 | goto error; 28 | cursor->predicate = predicate; 29 | cursor->index_match = 30 | cx_index_match_indexes(cursor->predicate, cursor->row_group); 31 | cx_row_cursor_rewind(cursor); 32 | return cursor; 33 | error: 34 | free(cursor); 35 | return NULL; 36 | } 37 | 38 | void cx_row_cursor_free(struct cx_row_cursor *cursor) 39 | { 40 | cx_row_group_cursor_free(cursor->cursor); 41 | free(cursor); 42 | } 43 | 44 | void cx_row_cursor_rewind(struct cx_row_cursor *cursor) 45 | { 46 | cursor->row_mask = 0; 47 | cursor->position = 64; 48 | cx_row_group_cursor_rewind(cursor->cursor); 49 | cursor->error = false; 50 | } 51 | 52 | static uint64_t cx_row_cursor_load_row_mask(struct cx_row_cursor *cursor) 53 | { 54 | uint64_t row_mask = 0; 55 | while (!row_mask && cx_row_group_cursor_next(cursor->cursor)) 
{ 56 | size_t count; 57 | if (cursor->index_match == CX_INDEX_MATCH_ALL) { 58 | count = cx_row_group_cursor_batch_count(cursor->cursor); 59 | row_mask = (uint64_t)-1; 60 | if (count < 64) 61 | row_mask &= ((uint64_t)1 << count) - 1; 62 | } else if (!cx_index_match_rows(cursor->predicate, cursor->row_group, 63 | cursor->cursor, &row_mask, &count)) 64 | goto error; 65 | } 66 | return row_mask; 67 | error: 68 | cursor->error = true; 69 | return 0; 70 | } 71 | 72 | bool cx_row_cursor_next(struct cx_row_cursor *cursor) 73 | { 74 | if (cursor->position < 63) { 75 | uint64_t mask = cursor->row_mask >> (cursor->position + 1); 76 | if (mask) { 77 | cursor->position += __builtin_ctzll(mask) + 1; 78 | return true; 79 | } 80 | } 81 | cursor->row_mask = cx_row_cursor_load_row_mask(cursor); 82 | if (!cursor->row_mask) 83 | return false; 84 | cursor->position = __builtin_ctzll(cursor->row_mask); 85 | return true; 86 | } 87 | 88 | bool cx_row_cursor_error(const struct cx_row_cursor *cursor) 89 | { 90 | return cursor->error; 91 | } 92 | 93 | size_t cx_row_cursor_count(struct cx_row_cursor *cursor) 94 | { 95 | if (cursor->index_match == CX_INDEX_MATCH_NONE) 96 | return 0; 97 | else if (cursor->index_match == CX_INDEX_MATCH_ALL) 98 | return cx_row_group_row_count(cursor->row_group); 99 | cx_row_cursor_rewind(cursor); 100 | size_t count = 0; 101 | for (;;) { 102 | uint64_t row_mask = cx_row_cursor_load_row_mask(cursor); 103 | if (!row_mask) 104 | break; 105 | count += __builtin_popcountll(row_mask); 106 | } 107 | return count; 108 | } 109 | 110 | bool cx_row_cursor_get_null(const struct cx_row_cursor *cursor, 111 | size_t column_index, bool *value) 112 | { 113 | assert(cursor->row_mask); 114 | size_t count; 115 | const uint64_t *nulls = 116 | cx_row_group_cursor_batch_nulls(cursor->cursor, column_index, &count); 117 | uint64_t row_bit = (uint64_t)1 << cursor->position; 118 | if (!count || !nulls) 119 | return false; 120 | *value = *nulls & row_bit; 121 | return true; 122 | } 123 | 124 
| bool cx_row_cursor_get_bit(const struct cx_row_cursor *cursor, 125 | size_t column_index, bool *value) 126 | { 127 | assert(cursor->row_mask); 128 | size_t count; 129 | const uint64_t *bitset = 130 | cx_row_group_cursor_batch_bit(cursor->cursor, column_index, &count); 131 | uint64_t row_bit = (uint64_t)1 << cursor->position; 132 | if (!count || !bitset) 133 | return false; 134 | *value = *bitset & row_bit; 135 | return true; 136 | } 137 | 138 | bool cx_row_cursor_get_i32(const struct cx_row_cursor *cursor, 139 | size_t column_index, int32_t *value) 140 | { 141 | assert(cursor->row_mask); 142 | size_t count; 143 | const int32_t *batch = 144 | cx_row_group_cursor_batch_i32(cursor->cursor, column_index, &count); 145 | if (!batch || !count) 146 | return false; 147 | *value = batch[cursor->position]; 148 | return true; 149 | } 150 | 151 | bool cx_row_cursor_get_i64(const struct cx_row_cursor *cursor, 152 | size_t column_index, int64_t *value) 153 | { 154 | assert(cursor->row_mask); 155 | size_t count; 156 | const int64_t *batch = 157 | cx_row_group_cursor_batch_i64(cursor->cursor, column_index, &count); 158 | if (!batch || !count) 159 | return false; 160 | *value = batch[cursor->position]; 161 | return true; 162 | } 163 | 164 | bool cx_row_cursor_get_flt(const struct cx_row_cursor *cursor, 165 | size_t column_index, float *value) 166 | { 167 | assert(cursor->row_mask); 168 | size_t count; 169 | const float *batch = 170 | cx_row_group_cursor_batch_flt(cursor->cursor, column_index, &count); 171 | if (!batch || !count) 172 | return false; 173 | *value = batch[cursor->position]; 174 | return true; 175 | } 176 | 177 | bool cx_row_cursor_get_dbl(const struct cx_row_cursor *cursor, 178 | size_t column_index, double *value) 179 | { 180 | assert(cursor->row_mask); 181 | size_t count; 182 | const double *batch = 183 | cx_row_group_cursor_batch_dbl(cursor->cursor, column_index, &count); 184 | if (!batch || !count) 185 | return false; 186 | *value = batch[cursor->position]; 187 | 
return true; 188 | } 189 | 190 | bool cx_row_cursor_get_str(const struct cx_row_cursor *cursor, 191 | size_t column_index, struct cx_string *value) 192 | { 193 | assert(cursor->row_mask); 194 | size_t count; 195 | const struct cx_string *batch = 196 | cx_row_group_cursor_batch_str(cursor->cursor, column_index, &count); 197 | if (!batch || !count) 198 | return false; 199 | const struct cx_string *string = &batch[cursor->position]; 200 | value->ptr = string->ptr; 201 | value->len = string->len; 202 | return true; 203 | } 204 | -------------------------------------------------------------------------------- /lib/row.h: -------------------------------------------------------------------------------- 1 | #ifndef CX_ROW_H_ 2 | #define CX_ROW_H_ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #include "predicate.h" 9 | 10 | struct cx_row_cursor; 11 | 12 | CX_EXPORT struct cx_row_cursor *cx_row_cursor_new(struct cx_row_group *, 13 | const struct cx_predicate *); 14 | 15 | CX_EXPORT void cx_row_cursor_free(struct cx_row_cursor *); 16 | 17 | CX_EXPORT void cx_row_cursor_rewind(struct cx_row_cursor *); 18 | 19 | CX_EXPORT bool cx_row_cursor_next(struct cx_row_cursor *); 20 | 21 | CX_EXPORT bool cx_row_cursor_error(const struct cx_row_cursor *); 22 | 23 | CX_EXPORT size_t cx_row_cursor_count(struct cx_row_cursor *); 24 | 25 | CX_EXPORT bool cx_row_cursor_get_null(const struct cx_row_cursor *, 26 | size_t column_index, bool *value); 27 | CX_EXPORT bool cx_row_cursor_get_bit(const struct cx_row_cursor *, 28 | size_t column_index, bool *value); 29 | CX_EXPORT bool cx_row_cursor_get_i32(const struct cx_row_cursor *, 30 | size_t column_index, int32_t *value); 31 | CX_EXPORT bool cx_row_cursor_get_i64(const struct cx_row_cursor *, 32 | size_t column_index, int64_t *value); 33 | CX_EXPORT bool cx_row_cursor_get_flt(const struct cx_row_cursor *, 34 | size_t column_index, float *value); 35 | CX_EXPORT bool cx_row_cursor_get_dbl(const struct cx_row_cursor *, 36 | size_t 
column_index, double *value); 37 | CX_EXPORT bool cx_row_cursor_get_str(const struct cx_row_cursor *, 38 | size_t column_index, 39 | struct cx_string *value); 40 | 41 | #ifdef __cplusplus 42 | } 43 | #endif 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /lib/row_group.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "compress.h" 6 | #include "row_group.h" 7 | 8 | static const size_t cx_row_group_column_initial_size = 8; 9 | 10 | struct cx_row_group_physical_column { 11 | struct cx_index *index; 12 | struct cx_column *column; 13 | struct cx_lazy_column lazy_column; 14 | }; 15 | 16 | struct cx_row_group_column { 17 | enum cx_column_type type; 18 | enum cx_encoding_type encoding; 19 | struct cx_row_group_physical_column values; 20 | struct cx_row_group_physical_column nulls; 21 | bool lazy; 22 | }; 23 | 24 | struct cx_row_group { 25 | struct cx_row_group_column *columns; 26 | size_t count; 27 | size_t size; 28 | size_t row_count; 29 | }; 30 | 31 | struct cx_row_group_cursor_physical_column { 32 | struct cx_column_cursor *cursor; 33 | size_t position; 34 | const void *batch; 35 | size_t count; 36 | }; 37 | 38 | struct cx_row_group_cursor_column { 39 | struct cx_row_group_cursor_physical_column values; 40 | struct cx_row_group_cursor_physical_column nulls; 41 | }; 42 | 43 | struct cx_row_group_cursor { 44 | struct cx_row_group *row_group; 45 | size_t column_count; 46 | size_t row_count; 47 | size_t position; 48 | bool initialized; 49 | struct cx_row_group_cursor_column columns[]; 50 | }; 51 | 52 | struct cx_row_group *cx_row_group_new() 53 | { 54 | struct cx_row_group *row_group = malloc(sizeof(*row_group)); 55 | if (!row_group) 56 | return NULL; 57 | row_group->columns = 58 | malloc(cx_row_group_column_initial_size * sizeof(*row_group->columns)); 59 | if (!row_group->columns) 60 | goto error; 61 | row_group->count = 0; 62 | 
row_group->size = cx_row_group_column_initial_size; 63 | return row_group; 64 | error: 65 | free(row_group); 66 | return NULL; 67 | } 68 | 69 | void cx_row_group_free(struct cx_row_group *row_group) 70 | { 71 | for (size_t i = 0; i < row_group->count; i++) { 72 | struct cx_row_group_column *row_group_column = &row_group->columns[i]; 73 | if (row_group_column->lazy) { 74 | if (row_group_column->values.column) 75 | cx_column_free(row_group_column->values.column); 76 | if (row_group_column->nulls.column) 77 | cx_column_free(row_group_column->nulls.column); 78 | } else { 79 | cx_index_free(row_group_column->values.index); 80 | cx_index_free(row_group_column->nulls.index); 81 | } 82 | } 83 | free(row_group->columns); 84 | free(row_group); 85 | } 86 | 87 | static bool cx_row_group_ensure_column_size(struct cx_row_group *row_group) 88 | { 89 | if (row_group->count == row_group->size) { 90 | size_t new_size = row_group->size * 2; 91 | assert(new_size && new_size > row_group->size); 92 | struct cx_row_group_column *columns = 93 | realloc(row_group->columns, new_size * sizeof(*columns)); 94 | if (!columns) 95 | return false; 96 | row_group->columns = columns; 97 | row_group->size = new_size; 98 | } 99 | return true; 100 | } 101 | 102 | bool cx_row_group_add_column(struct cx_row_group *row_group, 103 | struct cx_column *column, struct cx_column *nulls) 104 | { 105 | if (!column) 106 | return false; 107 | size_t row_count = cx_column_count(column); 108 | if (row_count != cx_column_count(nulls) || 109 | cx_column_type(nulls) != CX_COLUMN_BIT) 110 | return false; 111 | if (row_group->count && row_group->row_count != row_count) 112 | return false; 113 | struct cx_index *index = cx_index_new(column); 114 | struct cx_index *nulls_index = cx_index_new(nulls); 115 | if (!index || !nulls_index) 116 | goto error; 117 | if (!cx_row_group_ensure_column_size(row_group)) 118 | goto error; 119 | struct cx_row_group_column *row_group_column = 120 | &row_group->columns[row_group->count++]; 
121 | row_group_column->type = cx_column_type(column); 122 | row_group_column->encoding = cx_column_encoding(column); 123 | row_group_column->values.column = column; 124 | row_group_column->values.index = index; 125 | row_group_column->lazy = false; 126 | row_group_column->nulls.column = nulls; 127 | row_group_column->nulls.index = nulls_index; 128 | row_group->row_count = row_count; 129 | return true; 130 | error: 131 | if (index) 132 | cx_index_free(index); 133 | if (nulls_index) 134 | cx_index_free(nulls_index); 135 | return false; 136 | } 137 | 138 | bool cx_row_group_add_lazy_column(struct cx_row_group *row_group, 139 | const struct cx_lazy_column *column, 140 | const struct cx_lazy_column *nulls) 141 | { 142 | size_t row_count = column->index->count; 143 | if (row_count != nulls->index->count || nulls->type != CX_COLUMN_BIT) 144 | return false; 145 | if (row_group->count && row_group->row_count != row_count) 146 | return false; 147 | if (!cx_row_group_ensure_column_size(row_group)) 148 | return false; 149 | struct cx_row_group_column *row_group_column = 150 | &row_group->columns[row_group->count++]; 151 | row_group_column->type = column->type; 152 | row_group_column->encoding = column->encoding; 153 | row_group_column->values.index = (struct cx_index *)column->index; 154 | row_group_column->values.column = NULL; 155 | memcpy(&row_group_column->values.lazy_column, column, sizeof(*column)); 156 | row_group_column->lazy = true; 157 | row_group_column->nulls.column = NULL; 158 | row_group_column->nulls.index = (struct cx_index *)nulls->index; 159 | memcpy(&row_group_column->nulls.lazy_column, nulls, sizeof(*nulls)); 160 | row_group->row_count = row_count; 161 | return true; 162 | } 163 | 164 | size_t cx_row_group_column_count(const struct cx_row_group *row_group) 165 | { 166 | return row_group->count; 167 | } 168 | 169 | size_t cx_row_group_row_count(const struct cx_row_group *row_group) 170 | { 171 | if (!row_group->count) 172 | return 0; 173 | const struct 
cx_index *index = cx_row_group_column_index(row_group, 0); 174 | return index->count; 175 | } 176 | 177 | enum cx_column_type cx_row_group_column_type( 178 | const struct cx_row_group *row_group, size_t index) 179 | { 180 | assert(index < row_group->count); 181 | return row_group->columns[index].type; 182 | } 183 | 184 | enum cx_encoding_type cx_row_group_column_encoding( 185 | const struct cx_row_group *row_group, size_t index) 186 | { 187 | assert(index < row_group->count); 188 | return row_group->columns[index].encoding; 189 | } 190 | 191 | const struct cx_index *cx_row_group_column_index( 192 | const struct cx_row_group *row_group, size_t index) 193 | { 194 | assert(index < row_group->count); 195 | return row_group->columns[index].values.index; 196 | } 197 | 198 | const struct cx_index *cx_row_group_null_index( 199 | const struct cx_row_group *row_group, size_t index) 200 | { 201 | assert(index < row_group->count); 202 | return row_group->columns[index].nulls.index; 203 | } 204 | 205 | static bool cx_row_group_lazy_column_init( 206 | struct cx_row_group_physical_column *row_group_column) 207 | { 208 | struct cx_lazy_column *lazy = &row_group_column->lazy_column; 209 | struct cx_column *column = NULL; 210 | if (lazy->compression && lazy->size) { 211 | void *dest; 212 | column = cx_column_new_compressed(lazy->type, lazy->encoding, &dest, 213 | lazy->decompressed_size, 214 | row_group_column->index->count); 215 | if (!column) 216 | goto error; 217 | if (!cx_decompress(lazy->compression, lazy->ptr, lazy->size, dest, 218 | lazy->decompressed_size)) 219 | goto error; 220 | } else { 221 | column = 222 | cx_column_new_mmapped(lazy->type, lazy->encoding, lazy->ptr, 223 | lazy->size, row_group_column->index->count); 224 | if (!column) 225 | goto error; 226 | } 227 | row_group_column->column = column; 228 | return true; 229 | error: 230 | if (column) 231 | cx_column_free(column); 232 | return false; 233 | } 234 | 235 | const struct cx_column *cx_row_group_column( 236 | 
const struct cx_row_group *row_group, size_t index) 237 | { 238 | assert(index < row_group->count); 239 | struct cx_row_group_column *row_group_column = &row_group->columns[index]; 240 | if (row_group_column->lazy && !row_group_column->values.column) 241 | if (!cx_row_group_lazy_column_init(&row_group_column->values)) 242 | return NULL; 243 | return row_group_column->values.column; 244 | } 245 | 246 | const struct cx_column *cx_row_group_nulls(const struct cx_row_group *row_group, 247 | size_t index) 248 | { 249 | assert(index < row_group->count); 250 | struct cx_row_group_column *row_group_column = &row_group->columns[index]; 251 | if (row_group_column->lazy && !row_group_column->nulls.column) 252 | if (!cx_row_group_lazy_column_init(&row_group_column->nulls)) 253 | return NULL; 254 | return row_group_column->nulls.column; 255 | } 256 | 257 | struct cx_row_group_cursor *cx_row_group_cursor_new( 258 | struct cx_row_group *row_group) 259 | { 260 | size_t column_count = cx_row_group_column_count(row_group); 261 | size_t size = sizeof(struct cx_row_group_cursor) + 262 | column_count * sizeof(struct cx_row_group_cursor_column); 263 | struct cx_row_group_cursor *cursor = calloc(1, size); 264 | if (!cursor) 265 | return NULL; 266 | cursor->row_group = row_group; 267 | cursor->column_count = column_count; 268 | cursor->row_count = cx_row_group_row_count(row_group); 269 | return cursor; 270 | } 271 | 272 | void cx_row_group_cursor_free(struct cx_row_group_cursor *cursor) 273 | { 274 | cx_row_group_cursor_rewind(cursor); 275 | free(cursor); 276 | } 277 | 278 | static void cx_row_group_cursor_rewind_columns( 279 | struct cx_row_group_cursor_physical_column *column) 280 | { 281 | if (column->cursor) 282 | cx_column_cursor_free(column->cursor); 283 | column->cursor = NULL; 284 | column->position = 0; 285 | } 286 | 287 | void cx_row_group_cursor_rewind(struct cx_row_group_cursor *cursor) 288 | { 289 | cursor->initialized = false; 290 | cursor->position = 0; 291 | for (size_t i 
= 0; i < cursor->column_count; i++) { 292 | cx_row_group_cursor_rewind_columns(&cursor->columns[i].values); 293 | cx_row_group_cursor_rewind_columns(&cursor->columns[i].nulls); 294 | } 295 | } 296 | 297 | bool cx_row_group_cursor_next(struct cx_row_group_cursor *cursor) 298 | { 299 | if (!cursor->initialized) 300 | cursor->initialized = true; 301 | else 302 | cursor->position += CX_BATCH_SIZE; 303 | return cursor->position < cursor->row_count; 304 | } 305 | 306 | size_t cx_row_group_cursor_batch_count(const struct cx_row_group_cursor *cursor) 307 | { 308 | if (cursor->position >= cursor->row_count) 309 | return 0; 310 | size_t remaining = cursor->row_count - cursor->position; 311 | return remaining < CX_BATCH_SIZE ? remaining : CX_BATCH_SIZE; 312 | } 313 | 314 | static bool cx_row_group_cursor_lazy_column_init( 315 | struct cx_row_group_cursor *cursor, size_t column_index) 316 | { 317 | if (column_index >= cursor->column_count) 318 | return false; 319 | if (cursor->columns[column_index].values.cursor) 320 | return true; 321 | const struct cx_column *column = 322 | cx_row_group_column(cursor->row_group, column_index); 323 | if (!column) 324 | return false; 325 | cursor->columns[column_index].values.cursor = cx_column_cursor_new(column); 326 | return cursor->columns[column_index].values.cursor != NULL; 327 | } 328 | 329 | static bool cx_row_group_cursor_lazy_nulls_init( 330 | struct cx_row_group_cursor *cursor, size_t column_index) 331 | { 332 | if (column_index >= cursor->column_count) 333 | return false; 334 | if (cursor->columns[column_index].nulls.cursor) 335 | return true; 336 | const struct cx_column *column = 337 | cx_row_group_nulls(cursor->row_group, column_index); 338 | if (!column) 339 | return false; 340 | cursor->columns[column_index].nulls.cursor = cx_column_cursor_new(column); 341 | return cursor->columns[column_index].nulls.cursor != NULL; 342 | } 343 | 344 | const uint64_t *cx_row_group_cursor_batch_nulls( 345 | struct cx_row_group_cursor *cursor, 
size_t column_index, size_t *count) 346 | { 347 | if (!cx_row_group_cursor_lazy_nulls_init(cursor, column_index)) 348 | return NULL; 349 | struct cx_row_group_cursor_column *column = &cursor->columns[column_index]; 350 | if (column->nulls.position <= cursor->position) { 351 | size_t skipped = cx_column_cursor_skip_bit( 352 | column->nulls.cursor, cursor->position - column->nulls.position); 353 | column->nulls.position += skipped; 354 | column->nulls.batch = cx_column_cursor_next_batch_bit( 355 | column->nulls.cursor, &column->nulls.count); 356 | column->nulls.position += column->nulls.count; 357 | } 358 | *count = column->nulls.count; 359 | return column->nulls.batch; 360 | } 361 | 362 | const uint64_t *cx_row_group_cursor_batch_bit( 363 | struct cx_row_group_cursor *cursor, size_t column_index, size_t *count) 364 | { 365 | if (!cx_row_group_cursor_lazy_column_init(cursor, column_index)) 366 | return NULL; 367 | struct cx_row_group_cursor_column *column = &cursor->columns[column_index]; 368 | if (column->values.position <= cursor->position) { 369 | size_t skipped = cx_column_cursor_skip_bit( 370 | column->values.cursor, cursor->position - column->values.position); 371 | column->values.position += skipped; 372 | column->values.batch = cx_column_cursor_next_batch_bit( 373 | column->values.cursor, &column->values.count); 374 | column->values.position += column->values.count; 375 | } 376 | *count = column->values.count; 377 | return column->values.batch; 378 | } 379 | 380 | const int32_t *cx_row_group_cursor_batch_i32(struct cx_row_group_cursor *cursor, 381 | size_t column_index, size_t *count) 382 | { 383 | if (!cx_row_group_cursor_lazy_column_init(cursor, column_index)) 384 | return NULL; 385 | struct cx_row_group_cursor_column *column = &cursor->columns[column_index]; 386 | if (column->values.position <= cursor->position) { 387 | size_t skipped = cx_column_cursor_skip_i32( 388 | column->values.cursor, cursor->position - column->values.position); 389 | 
column->values.position += skipped; 390 | column->values.batch = cx_column_cursor_next_batch_i32( 391 | column->values.cursor, &column->values.count); 392 | column->values.position += column->values.count; 393 | } 394 | *count = column->values.count; 395 | return column->values.batch; 396 | } 397 | 398 | const int64_t *cx_row_group_cursor_batch_i64(struct cx_row_group_cursor *cursor, 399 | size_t column_index, size_t *count) 400 | { 401 | if (!cx_row_group_cursor_lazy_column_init(cursor, column_index)) 402 | return NULL; 403 | struct cx_row_group_cursor_column *column = &cursor->columns[column_index]; 404 | if (column->values.position <= cursor->position) { 405 | size_t skipped = cx_column_cursor_skip_i64( 406 | column->values.cursor, cursor->position - column->values.position); 407 | column->values.position += skipped; 408 | column->values.batch = cx_column_cursor_next_batch_i64( 409 | column->values.cursor, &column->values.count); 410 | column->values.position += column->values.count; 411 | } 412 | *count = column->values.count; 413 | return column->values.batch; 414 | } 415 | 416 | const float *cx_row_group_cursor_batch_flt(struct cx_row_group_cursor *cursor, 417 | size_t column_index, size_t *count) 418 | { 419 | if (!cx_row_group_cursor_lazy_column_init(cursor, column_index)) 420 | return NULL; 421 | struct cx_row_group_cursor_column *column = &cursor->columns[column_index]; 422 | if (column->values.position <= cursor->position) { 423 | size_t skipped = cx_column_cursor_skip_flt( 424 | column->values.cursor, cursor->position - column->values.position); 425 | column->values.position += skipped; 426 | column->values.batch = cx_column_cursor_next_batch_flt( 427 | column->values.cursor, &column->values.count); 428 | column->values.position += column->values.count; 429 | } 430 | *count = column->values.count; 431 | return column->values.batch; 432 | } 433 | 434 | const double *cx_row_group_cursor_batch_dbl(struct cx_row_group_cursor *cursor, 435 | size_t 
column_index, size_t *count) 436 | { 437 | if (!cx_row_group_cursor_lazy_column_init(cursor, column_index)) 438 | return NULL; 439 | struct cx_row_group_cursor_column *column = &cursor->columns[column_index]; 440 | if (column->values.position <= cursor->position) { 441 | size_t skipped = cx_column_cursor_skip_dbl( 442 | column->values.cursor, cursor->position - column->values.position); 443 | column->values.position += skipped; 444 | column->values.batch = cx_column_cursor_next_batch_dbl( 445 | column->values.cursor, &column->values.count); 446 | column->values.position += column->values.count; 447 | } 448 | *count = column->values.count; 449 | return column->values.batch; 450 | } 451 | 452 | const struct cx_string *cx_row_group_cursor_batch_str( 453 | struct cx_row_group_cursor *cursor, size_t column_index, size_t *count) 454 | { 455 | if (!cx_row_group_cursor_lazy_column_init(cursor, column_index)) 456 | return NULL; 457 | struct cx_row_group_cursor_column *column = &cursor->columns[column_index]; 458 | if (column->values.position <= cursor->position) { 459 | size_t skipped = cx_column_cursor_skip_str( 460 | column->values.cursor, cursor->position - column->values.position); 461 | column->values.position += skipped; 462 | column->values.batch = cx_column_cursor_next_batch_str( 463 | column->values.cursor, &column->values.count); 464 | column->values.position += column->values.count; 465 | } 466 | *count = column->values.count; 467 | return column->values.batch; 468 | } 469 | -------------------------------------------------------------------------------- /lib/row_group.h: -------------------------------------------------------------------------------- 1 | #ifndef CX_ROW_GROUP_H_ 2 | #define CX_ROW_GROUP_H_ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #include "column.h" 9 | #include "index.h" 10 | 11 | struct cx_row_group; 12 | 13 | struct cx_row_group_cursor; 14 | 15 | struct cx_row_group *cx_row_group_new(void); 16 | 17 | void 
cx_row_group_free(struct cx_row_group *); 18 | 19 | bool cx_row_group_add_column(struct cx_row_group *, struct cx_column *column, 20 | struct cx_column *nulls); 21 | 22 | struct cx_lazy_column { 23 | enum cx_column_type type; 24 | enum cx_encoding_type encoding; 25 | enum cx_compression_type compression; 26 | const struct cx_index *index; 27 | const void *ptr; 28 | size_t size; 29 | size_t decompressed_size; 30 | }; 31 | 32 | bool cx_row_group_add_lazy_column(struct cx_row_group *, 33 | const struct cx_lazy_column *column, 34 | const struct cx_lazy_column *nulls); 35 | 36 | size_t cx_row_group_column_count(const struct cx_row_group *); 37 | 38 | size_t cx_row_group_row_count(const struct cx_row_group *); 39 | 40 | enum cx_column_type cx_row_group_column_type(const struct cx_row_group *, 41 | size_t); 42 | 43 | enum cx_encoding_type cx_row_group_column_encoding(const struct cx_row_group *, 44 | size_t); 45 | 46 | const struct cx_index *cx_row_group_column_index(const struct cx_row_group *, 47 | size_t); 48 | 49 | const struct cx_column *cx_row_group_column(const struct cx_row_group *, 50 | size_t); 51 | 52 | const struct cx_index *cx_row_group_null_index(const struct cx_row_group *, 53 | size_t); 54 | 55 | const struct cx_column *cx_row_group_nulls(const struct cx_row_group *, size_t); 56 | 57 | struct cx_row_group_cursor *cx_row_group_cursor_new(struct cx_row_group *); 58 | 59 | void cx_row_group_cursor_free(struct cx_row_group_cursor *); 60 | 61 | void cx_row_group_cursor_rewind(struct cx_row_group_cursor *); 62 | 63 | bool cx_row_group_cursor_next(struct cx_row_group_cursor *); 64 | 65 | size_t cx_row_group_cursor_batch_count(const struct cx_row_group_cursor *); 66 | 67 | const uint64_t *cx_row_group_cursor_batch_nulls(struct cx_row_group_cursor *, 68 | size_t column_index, 69 | size_t *count); 70 | const uint64_t *cx_row_group_cursor_batch_bit(struct cx_row_group_cursor *, 71 | size_t column_index, 72 | size_t *count); 73 | const int32_t 
*cx_row_group_cursor_batch_i32(struct cx_row_group_cursor *, 74 | size_t column_index, 75 | size_t *count); 76 | const int64_t *cx_row_group_cursor_batch_i64(struct cx_row_group_cursor *, 77 | size_t column_index, 78 | size_t *count); 79 | const float *cx_row_group_cursor_batch_flt(struct cx_row_group_cursor *, 80 | size_t column_index, size_t *count); 81 | const double *cx_row_group_cursor_batch_dbl(struct cx_row_group_cursor *, 82 | size_t column_index, size_t *count); 83 | const struct cx_string *cx_row_group_cursor_batch_str( 84 | struct cx_row_group_cursor *, size_t column_index, size_t *count); 85 | 86 | #ifdef __cplusplus 87 | } 88 | #endif 89 | 90 | #endif 91 | -------------------------------------------------------------------------------- /lib/version.h: -------------------------------------------------------------------------------- 1 | #ifndef CX_VERSION_H_ 2 | #define CX_VERSION_H_ 3 | 4 | #define CX_VERSION_MAJOR 1 5 | #define CX_VERSION_MINOR 0 6 | #define CX_VERSION_RELEASE 0 7 | 8 | #define CX_VERSION CX_VERSION_MAJOR.CX_VERSION_MINOR.CX_VERSION_RELEASE 9 | 10 | #define CX_QUOTE(str) #str 11 | #define CX_EXPAND(str) CX_QUOTE(str) 12 | #define CX_VERSION_STRING CX_EXPAND(CX_VERSION) 13 | 14 | #define CX_VERSION_NUMBER \ 15 | (CX_VERSION_MAJOR * 100 * 100 + CX_VERSION_MINOR * 100 + CX_VERSION_RELEASE) 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /lib/writer.h: -------------------------------------------------------------------------------- 1 | #ifndef CX_WRITER_H_ 2 | #define CX_WRITER_H_ 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | #include "row_group.h" 9 | 10 | struct cx_writer; 11 | 12 | CX_EXPORT struct cx_writer *cx_writer_new(const char *, size_t row_group_size); 13 | 14 | CX_EXPORT void cx_writer_free(struct cx_writer *); 15 | 16 | CX_EXPORT bool cx_writer_metadata(struct cx_writer *, const char *); 17 | 18 | CX_EXPORT bool cx_writer_add_column(struct cx_writer *, 
const char *name, 19 | enum cx_column_type, enum cx_encoding_type, 20 | enum cx_compression_type, int level); 21 | 22 | CX_EXPORT bool cx_writer_put_bit(struct cx_writer *, size_t, bool); 23 | CX_EXPORT bool cx_writer_put_i32(struct cx_writer *, size_t, int32_t); 24 | CX_EXPORT bool cx_writer_put_i64(struct cx_writer *, size_t, int64_t); 25 | CX_EXPORT bool cx_writer_put_flt(struct cx_writer *, size_t, float); 26 | CX_EXPORT bool cx_writer_put_dbl(struct cx_writer *, size_t, double); 27 | CX_EXPORT bool cx_writer_put_str(struct cx_writer *, size_t, const char *); 28 | CX_EXPORT bool cx_writer_put_null(struct cx_writer *, size_t); 29 | 30 | CX_EXPORT bool cx_writer_finish(struct cx_writer *, bool sync); 31 | 32 | struct cx_row_group_writer; 33 | 34 | CX_EXPORT struct cx_row_group_writer *cx_row_group_writer_new(const char *); 35 | 36 | CX_EXPORT void cx_row_group_writer_free(struct cx_row_group_writer *); 37 | 38 | CX_EXPORT bool cx_row_group_writer_metadata(struct cx_row_group_writer *, 39 | const char *); 40 | 41 | CX_EXPORT bool cx_row_group_writer_add_column( 42 | struct cx_row_group_writer *, const char *name, enum cx_column_type, 43 | enum cx_encoding_type, enum cx_compression_type, int level); 44 | 45 | CX_EXPORT bool cx_row_group_writer_put(struct cx_row_group_writer *, 46 | struct cx_row_group *); 47 | 48 | CX_EXPORT bool cx_row_group_writer_finish(struct cx_row_group_writer *, 49 | bool sync); 50 | 51 | #ifdef __cplusplus 52 | } 53 | #endif 54 | 55 | #endif 56 | -------------------------------------------------------------------------------- /test/Makefile: -------------------------------------------------------------------------------- 1 | CFLAGS += -std=c99 -I../lib -g -pedantic -Wall 2 | 3 | ifneq ($(shell uname -s), Darwin) 4 | LDFLAGS += -pthread 5 | endif 6 | 7 | # link with the static lib. 
This allows us to test parts of 8 | # the library that are hidden via -fvisibility=hidden 9 | LDLIBS = ../lib/libcolumnix.a -llz4 -lzstd 10 | 11 | SRC := $(wildcard *.c) 12 | OBJ := $(SRC:.c=.o) 13 | 14 | RUNNER = runner 15 | 16 | test: $(RUNNER) 17 | ./$(RUNNER) $(grep) 18 | 19 | $(RUNNER): $(OBJ) 20 | $(CC) $(LDFLAGS) -o $@ $^ $(LDLIBS) 21 | 22 | .c.o: 23 | $(CC) $(CFLAGS) -c -o $@ $< 24 | 25 | clean: 26 | $(RM) $(RUNNER) $(OBJ) 27 | 28 | valgrind: $(RUNNER) 29 | valgrind --leak-check=full ./$(RUNNER) --no-fork 30 | 31 | .PHONY: test clean valgrind 32 | -------------------------------------------------------------------------------- /test/column.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "column.h" 6 | 7 | #include "helpers.h" 8 | 9 | #define COUNT 1111 10 | 11 | static void *setup_bit(const MunitParameter params[], void *data) 12 | { 13 | struct cx_column *col = cx_column_new(CX_COLUMN_BIT, CX_ENCODING_NONE); 14 | assert_not_null(col); 15 | for (int32_t i = 0; i < COUNT; i++) 16 | assert_true(cx_column_put_bit(col, i % 5 == 0)); 17 | assert_int(cx_column_type(col), ==, CX_COLUMN_BIT); 18 | assert_int(cx_column_encoding(col), ==, CX_ENCODING_NONE); 19 | return col; 20 | } 21 | 22 | static void *setup_i32(const MunitParameter params[], void *data) 23 | { 24 | struct cx_column *col = cx_column_new(CX_COLUMN_I32, CX_ENCODING_NONE); 25 | assert_not_null(col); 26 | for (int32_t i = 0; i < COUNT; i++) 27 | assert_true(cx_column_put_i32(col, i)); 28 | assert_int(cx_column_type(col), ==, CX_COLUMN_I32); 29 | assert_int(cx_column_encoding(col), ==, CX_ENCODING_NONE); 30 | return col; 31 | } 32 | 33 | static void *setup_i64(const MunitParameter params[], void *data) 34 | { 35 | struct cx_column *col = cx_column_new(CX_COLUMN_I64, CX_ENCODING_NONE); 36 | assert_not_null(col); 37 | for (int32_t i = 0; i < COUNT; i++) 38 | assert_true(cx_column_put_i64(col, i)); 39 | 
assert_int(cx_column_type(col), ==, CX_COLUMN_I64); 40 | assert_int(cx_column_encoding(col), ==, CX_ENCODING_NONE); 41 | return col; 42 | } 43 | 44 | static void *setup_flt(const MunitParameter params[], void *data) 45 | { 46 | struct cx_column *col = cx_column_new(CX_COLUMN_FLT, CX_ENCODING_NONE); 47 | assert_not_null(col); 48 | for (int32_t i = 0; i < COUNT; i++) 49 | assert_true(cx_column_put_flt(col, (float)i)); 50 | assert_int(cx_column_type(col), ==, CX_COLUMN_FLT); 51 | assert_int(cx_column_encoding(col), ==, CX_ENCODING_NONE); 52 | return col; 53 | } 54 | 55 | static void *setup_dbl(const MunitParameter params[], void *data) 56 | { 57 | struct cx_column *col = cx_column_new(CX_COLUMN_DBL, CX_ENCODING_NONE); 58 | assert_not_null(col); 59 | for (int32_t i = 0; i < COUNT; i++) 60 | assert_true(cx_column_put_dbl(col, (double)i)); 61 | assert_int(cx_column_type(col), ==, CX_COLUMN_DBL); 62 | assert_int(cx_column_encoding(col), ==, CX_ENCODING_NONE); 63 | return col; 64 | } 65 | 66 | static void *setup_str(const MunitParameter params[], void *data) 67 | { 68 | struct cx_column *col = cx_column_new(CX_COLUMN_STR, CX_ENCODING_NONE); 69 | assert_not_null(col); 70 | char buffer[64]; 71 | for (size_t i = 0; i < COUNT; i++) { 72 | sprintf(buffer, "cx %zu", i); 73 | assert_true(cx_column_put_str(col, buffer)); 74 | } 75 | assert_int(cx_column_type(col), ==, CX_COLUMN_STR); 76 | assert_int(cx_column_encoding(col), ==, CX_ENCODING_NONE); 77 | return col; 78 | } 79 | 80 | static void teardown(void *fixture) 81 | { 82 | cx_column_free((struct cx_column *)fixture); 83 | } 84 | 85 | static MunitResult test_export(const MunitParameter params[], void *fixture) 86 | { 87 | struct cx_column *col = (struct cx_column *)fixture; 88 | size_t size; 89 | const int32_t *buf = cx_column_export(col, &size); 90 | assert_not_null(buf); 91 | assert_size(size, ==, sizeof(int32_t) * COUNT); 92 | for (int32_t i = 0; i < COUNT; i++) 93 | assert_int32(i, ==, buf[i]); 94 | return MUNIT_OK; 95 | } 
96 | 97 | static void assert_i32_col_equal(const struct cx_column *a, 98 | const struct cx_column *b) 99 | { 100 | assert_not_null(a); 101 | assert_not_null(b); 102 | size_t a_size, b_size; 103 | const void *a_ptr = cx_column_export(a, &a_size); 104 | const void *b_ptr = cx_column_export(b, &b_size); 105 | assert_not_null(a_ptr); 106 | assert_not_null(b_ptr); 107 | assert_size(a_size, ==, b_size); 108 | assert_memory_equal(a_size, a_ptr, b_ptr); 109 | } 110 | 111 | static MunitResult test_import_mmapped(const MunitParameter params[], 112 | void *fixture) 113 | { 114 | struct cx_column *col = (struct cx_column *)fixture; 115 | size_t size; 116 | const void *ptr = cx_column_export(col, &size); 117 | assert_not_null(ptr); 118 | struct cx_column *copy = cx_column_new_mmapped( 119 | CX_COLUMN_I32, CX_ENCODING_NONE, ptr, size, cx_column_count(col)); 120 | assert_i32_col_equal(col, copy); 121 | assert_false(cx_column_put_i32(copy, 0)); // mmapped 122 | cx_column_free(copy); 123 | return MUNIT_OK; 124 | } 125 | 126 | static MunitResult test_import_compressed(const MunitParameter params[], 127 | void *fixture) 128 | { 129 | struct cx_column *col = (struct cx_column *)fixture; 130 | size_t size; 131 | const void *ptr = cx_column_export(col, &size); 132 | assert_not_null(ptr); 133 | void *dest; 134 | struct cx_column *copy = cx_column_new_compressed( 135 | CX_COLUMN_I32, CX_ENCODING_NONE, &dest, size, cx_column_count(col)); 136 | assert_not_null(copy); 137 | memcpy(dest, ptr, size); 138 | assert_i32_col_equal(col, copy); 139 | assert_true(cx_column_put_i32(copy, 0)); // mutable 140 | cx_column_free(copy); 141 | return MUNIT_OK; 142 | } 143 | 144 | static MunitResult test_bit_put_mismatch(const MunitParameter params[], 145 | void *fixture) 146 | { 147 | struct cx_column *col = (struct cx_column *)fixture; 148 | assert_false(cx_column_put_i64(col, 1)); 149 | return MUNIT_OK; 150 | } 151 | 152 | static MunitResult test_bit_cursor(const MunitParameter params[], void *fixture) 153 
| { 154 | struct cx_column *col = (struct cx_column *)fixture; 155 | struct cx_column_cursor *cursor = cx_column_cursor_new(col); 156 | assert_not_null(cursor); 157 | 158 | size_t position, count; 159 | size_t starting_positions[] = {0, 64, 256, COUNT - (COUNT % 64)}; 160 | 161 | CX_FOREACH(starting_positions, position) 162 | { 163 | assert_size(cx_column_cursor_skip_bit(cursor, position), ==, position); 164 | while (cx_column_cursor_valid(cursor)) { 165 | const uint64_t *bitset = 166 | cx_column_cursor_next_batch_bit(cursor, &count); 167 | for (size_t i = 0; i < count; i++) { 168 | bool bit = *bitset & ((uint64_t)1 << i); 169 | if ((i + position) % 5 == 0) 170 | assert_true(bit); 171 | else 172 | assert_false(bit); 173 | } 174 | position += count; 175 | } 176 | assert_size(position, ==, COUNT); 177 | cx_column_cursor_rewind(cursor); 178 | } 179 | 180 | cx_column_cursor_free(cursor); 181 | return MUNIT_OK; 182 | } 183 | 184 | static MunitResult test_i32_put_mismatch(const MunitParameter params[], 185 | void *fixture) 186 | { 187 | struct cx_column *col = (struct cx_column *)fixture; 188 | assert_false(cx_column_put_i64(col, 1)); 189 | return MUNIT_OK; 190 | } 191 | 192 | static MunitResult test_i32_cursor(const MunitParameter params[], void *fixture) 193 | { 194 | struct cx_column *col = (struct cx_column *)fixture; 195 | struct cx_column_cursor *cursor = cx_column_cursor_new(col); 196 | assert_not_null(cursor); 197 | 198 | size_t position, count; 199 | size_t starting_positions[] = {0, 1, 8, 13, 64, 234, COUNT / 2 + 1, COUNT}; 200 | 201 | CX_FOREACH(starting_positions, position) 202 | { 203 | assert_size(cx_column_cursor_skip_i32(cursor, position), ==, position); 204 | while (cx_column_cursor_valid(cursor)) { 205 | const int32_t *values = 206 | cx_column_cursor_next_batch_i32(cursor, &count); 207 | for (int32_t j = 0; j < count; j++) 208 | assert_int32(values[j], ==, j + position); 209 | position += count; 210 | } 211 | assert_size(position, ==, COUNT); 212 | 
cx_column_cursor_rewind(cursor); 213 | } 214 | 215 | cx_column_cursor_free(cursor); 216 | return MUNIT_OK; 217 | } 218 | 219 | static MunitResult test_i64_put_mismatch(const MunitParameter params[], 220 | void *fixture) 221 | { 222 | struct cx_column *col = (struct cx_column *)fixture; 223 | assert_false(cx_column_put_i32(col, 1)); 224 | return MUNIT_OK; 225 | } 226 | 227 | static MunitResult test_i64_cursor(const MunitParameter params[], void *fixture) 228 | { 229 | struct cx_column *col = (struct cx_column *)fixture; 230 | struct cx_column_cursor *cursor = cx_column_cursor_new(col); 231 | assert_not_null(cursor); 232 | 233 | size_t position, count; 234 | size_t starting_positions[] = {0, 1, 8, 13, 64, 234, COUNT / 2 + 1, COUNT}; 235 | 236 | CX_FOREACH(starting_positions, position) 237 | { 238 | assert_size(cx_column_cursor_skip_i64(cursor, position), ==, position); 239 | while (cx_column_cursor_valid(cursor)) { 240 | const int64_t *values = 241 | cx_column_cursor_next_batch_i64(cursor, &count); 242 | for (int64_t j = 0; j < count; j++) 243 | assert_int64(values[j], ==, j + position); 244 | position += count; 245 | } 246 | assert_size(position, ==, COUNT); 247 | cx_column_cursor_rewind(cursor); 248 | } 249 | 250 | cx_column_cursor_free(cursor); 251 | return MUNIT_OK; 252 | } 253 | 254 | static MunitResult test_flt_put_mismatch(const MunitParameter params[], 255 | void *fixture) 256 | { 257 | struct cx_column *col = (struct cx_column *)fixture; 258 | assert_false(cx_column_put_i64(col, 1)); 259 | return MUNIT_OK; 260 | } 261 | 262 | static MunitResult test_flt_cursor(const MunitParameter params[], void *fixture) 263 | { 264 | struct cx_column *col = (struct cx_column *)fixture; 265 | struct cx_column_cursor *cursor = cx_column_cursor_new(col); 266 | assert_not_null(cursor); 267 | 268 | size_t position, count; 269 | size_t starting_positions[] = {0, 1, 8, 13, 64, 234, COUNT / 2 + 1, COUNT}; 270 | 271 | CX_FOREACH(starting_positions, position) 272 | { 273 | 
assert_size(cx_column_cursor_skip_flt(cursor, position), ==, position); 274 | while (cx_column_cursor_valid(cursor)) { 275 | const float *values = 276 | cx_column_cursor_next_batch_flt(cursor, &count); 277 | for (size_t j = 0; j < count; j++) 278 | assert_float(values[j], ==, (float)(j + position)); 279 | position += count; 280 | } 281 | assert_size(position, ==, COUNT); 282 | cx_column_cursor_rewind(cursor); 283 | } 284 | 285 | cx_column_cursor_free(cursor); 286 | return MUNIT_OK; 287 | } 288 | 289 | static MunitResult test_dbl_put_mismatch(const MunitParameter params[], 290 | void *fixture) 291 | { 292 | struct cx_column *col = (struct cx_column *)fixture; 293 | assert_false(cx_column_put_i64(col, 1)); 294 | return MUNIT_OK; 295 | } 296 | 297 | static MunitResult test_dbl_cursor(const MunitParameter params[], void *fixture) 298 | { 299 | struct cx_column *col = (struct cx_column *)fixture; 300 | struct cx_column_cursor *cursor = cx_column_cursor_new(col); 301 | assert_not_null(cursor); 302 | 303 | size_t position, count; 304 | size_t starting_positions[] = {0, 1, 8, 13, 64, 234, COUNT / 2 + 1, COUNT}; 305 | 306 | CX_FOREACH(starting_positions, position) 307 | { 308 | assert_size(cx_column_cursor_skip_dbl(cursor, position), ==, position); 309 | while (cx_column_cursor_valid(cursor)) { 310 | const double *values = 311 | cx_column_cursor_next_batch_dbl(cursor, &count); 312 | for (size_t j = 0; j < count; j++) 313 | assert_double(values[j], ==, (double)(j + position)); 314 | position += count; 315 | } 316 | assert_size(position, ==, COUNT); 317 | cx_column_cursor_rewind(cursor); 318 | } 319 | 320 | cx_column_cursor_free(cursor); 321 | return MUNIT_OK; 322 | } 323 | 324 | static MunitResult test_str_put_mismatch(const MunitParameter params[], 325 | void *fixture) 326 | { 327 | struct cx_column *col = (struct cx_column *)fixture; 328 | assert_false(cx_column_put_i32(col, 1)); 329 | return MUNIT_OK; 330 | } 331 | 332 | static MunitResult test_str_cursor(const 
MunitParameter params[], void *fixture) 333 | { 334 | struct cx_column *col = (struct cx_column *)fixture; 335 | struct cx_column_cursor *cursor = cx_column_cursor_new(col); 336 | assert_not_null(cursor); 337 | 338 | char expected[64]; 339 | size_t position, count; 340 | size_t starting_positions[] = {0, 1, 8, 13, 64, COUNT - 1, COUNT}; 341 | 342 | CX_FOREACH(starting_positions, position) 343 | { 344 | assert_size(cx_column_cursor_skip_str(cursor, position), ==, position); 345 | while (cx_column_cursor_valid(cursor)) { 346 | const struct cx_string *strings = 347 | cx_column_cursor_next_batch_str(cursor, &count); 348 | for (size_t j = 0; j < count; j++) { 349 | sprintf(expected, "cx %zu", j + position); 350 | assert_int(strings[j].len, ==, strlen(expected)); 351 | assert_string_equal(expected, strings[j].ptr); 352 | } 353 | position += count; 354 | } 355 | assert_size(position, ==, COUNT); 356 | cx_column_cursor_rewind(cursor); 357 | } 358 | 359 | cx_column_cursor_free(cursor); 360 | return MUNIT_OK; 361 | } 362 | 363 | MunitTest column_tests[] = { 364 | {"/export", test_export, setup_i32, teardown, MUNIT_TEST_OPTION_NONE, NULL}, 365 | {"/import-mmapped", test_import_mmapped, setup_i32, teardown, 366 | MUNIT_TEST_OPTION_NONE, NULL}, 367 | {"/import-compressed", test_import_compressed, setup_i32, teardown, 368 | MUNIT_TEST_OPTION_NONE, NULL}, 369 | {"/bit-put-mismatch", test_bit_put_mismatch, setup_bit, teardown, 370 | MUNIT_TEST_OPTION_NONE, NULL}, 371 | {"/bit-cursor", test_bit_cursor, setup_bit, teardown, 372 | MUNIT_TEST_OPTION_NONE, NULL}, 373 | {"/i32-put-mismatch", test_i32_put_mismatch, setup_i32, teardown, 374 | MUNIT_TEST_OPTION_NONE, NULL}, 375 | {"/i32-cursor", test_i32_cursor, setup_i32, teardown, 376 | MUNIT_TEST_OPTION_NONE, NULL}, 377 | {"/flt-put-mismatch", test_flt_put_mismatch, setup_flt, teardown, 378 | MUNIT_TEST_OPTION_NONE, NULL}, 379 | {"/flt-cursor", test_flt_cursor, setup_flt, teardown, 380 | MUNIT_TEST_OPTION_NONE, NULL}, 381 | 
{"/dbl-put-mismatch", test_dbl_put_mismatch, setup_dbl, teardown, 382 | MUNIT_TEST_OPTION_NONE, NULL}, 383 | {"/dbl-cursor", test_dbl_cursor, setup_dbl, teardown, 384 | MUNIT_TEST_OPTION_NONE, NULL}, 385 | {"/i64-put-mismatch", test_i64_put_mismatch, setup_i64, teardown, 386 | MUNIT_TEST_OPTION_NONE, NULL}, 387 | {"/i64-cursor", test_i64_cursor, setup_i64, teardown, 388 | MUNIT_TEST_OPTION_NONE, NULL}, 389 | {"/str-put-mismatch", test_str_put_mismatch, setup_str, teardown, 390 | MUNIT_TEST_OPTION_NONE, NULL}, 391 | {"/str-cursor", test_str_cursor, setup_str, teardown, 392 | MUNIT_TEST_OPTION_NONE, NULL}, 393 | {NULL, NULL, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}}; 394 | -------------------------------------------------------------------------------- /test/compress.c: -------------------------------------------------------------------------------- 1 | #include "compress.h" 2 | 3 | #include "helpers.h" 4 | 5 | #define BUFFER_SIZE 11111 6 | 7 | static void *setup(const MunitParameter params[], void *data) 8 | { 9 | unsigned char *buffer = malloc(BUFFER_SIZE); 10 | assert_not_null(buffer); 11 | for (size_t i = 0; i < BUFFER_SIZE; i++) 12 | buffer[i] = i / 10; 13 | return buffer; 14 | } 15 | 16 | void test_compression(unsigned char *buffer, enum cx_compression_type type, 17 | int level) 18 | { 19 | size_t compressed_size; 20 | void *compressed = 21 | cx_compress(type, level, buffer, BUFFER_SIZE, &compressed_size); 22 | assert_not_null(compressed); 23 | assert_size(BUFFER_SIZE, >, compressed_size); 24 | for (size_t i = 0; i < BUFFER_SIZE; i++) 25 | buffer[i] = 0; 26 | assert_true( 27 | cx_decompress(type, compressed, compressed_size, buffer, BUFFER_SIZE)); 28 | for (size_t i = 0; i < BUFFER_SIZE; i++) 29 | assert_uchar(buffer[i], ==, i / 10); 30 | free(compressed); 31 | } 32 | 33 | static MunitResult test_lz4(const MunitParameter params[], void *buffer) 34 | { 35 | test_compression(buffer, CX_COMPRESSION_LZ4, 1); 36 | test_compression(buffer, CX_COMPRESSION_LZ4, 15); 
37 | return MUNIT_OK; 38 | } 39 | 40 | static MunitResult test_zstd(const MunitParameter params[], void *buffer) 41 | { 42 | test_compression(buffer, CX_COMPRESSION_ZSTD, 1); 43 | test_compression(buffer, CX_COMPRESSION_ZSTD, 15); 44 | return MUNIT_OK; 45 | } 46 | 47 | MunitTest compress_tests[] = { 48 | {"/lz4", test_lz4, setup, free, MUNIT_TEST_OPTION_NONE, NULL}, 49 | {"/zstd", test_zstd, setup, free, MUNIT_TEST_OPTION_NONE, NULL}, 50 | {NULL, NULL, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}}; 51 | -------------------------------------------------------------------------------- /test/file.c: -------------------------------------------------------------------------------- 1 | #define _BSD_SOURCE 2 | #include 3 | 4 | #include "reader.h" 5 | #include "row.h" 6 | #include "writer.h" 7 | 8 | #include "helpers.h" 9 | #include "temp_file.h" 10 | 11 | #define COLUMN_COUNT 6 12 | #define ROW_GROUP_COUNT 4 13 | #define ROWS_PER_ROW_GROUP 25 14 | #define ROW_COUNT (ROW_GROUP_COUNT * ROWS_PER_ROW_GROUP) 15 | 16 | struct cx_file_fixture { 17 | char *temp_file; 18 | struct cx_predicate *true_predicate; 19 | }; 20 | 21 | static enum cx_compression_type cx_compression_types[] = { 22 | CX_COMPRESSION_NONE, CX_COMPRESSION_LZ4, CX_COMPRESSION_LZ4HC, 23 | CX_COMPRESSION_ZSTD}; 24 | 25 | static void *setup(const MunitParameter params[], void *data) 26 | { 27 | struct cx_file_fixture *fixture = malloc(sizeof(*fixture)); 28 | assert_not_null(fixture); 29 | 30 | fixture->temp_file = cx_temp_file_new(); 31 | assert_not_null(fixture->temp_file); 32 | 33 | fixture->true_predicate = cx_predicate_new_true(); 34 | assert_not_null(fixture->true_predicate); 35 | 36 | return fixture; 37 | } 38 | 39 | static void teardown(void *ptr) 40 | { 41 | struct cx_file_fixture *fixture = ptr; 42 | cx_temp_file_free(fixture->temp_file); 43 | cx_predicate_free(fixture->true_predicate); 44 | free(fixture); 45 | } 46 | 47 | static void count_rows(struct cx_row_cursor *cursor, pthread_mutex_t *mutex, 48 | void 
*data) 49 | { 50 | size_t *count = data; 51 | size_t row_group_count = cx_row_cursor_count(cursor); 52 | pthread_mutex_lock(mutex); 53 | *count += row_group_count; 54 | pthread_mutex_unlock(mutex); 55 | } 56 | 57 | static MunitResult test_read_write(const MunitParameter params[], void *ptr) 58 | { 59 | struct cx_file_fixture *fixture = ptr; 60 | 61 | enum cx_compression_type compression; 62 | int level = 5; 63 | char buffer[64]; 64 | 65 | enum cx_column_type types[] = {CX_COLUMN_I32, CX_COLUMN_I64, CX_COLUMN_BIT, 66 | CX_COLUMN_STR, CX_COLUMN_FLT, CX_COLUMN_DBL}; 67 | 68 | CX_FOREACH(cx_compression_types, compression) 69 | { 70 | struct cx_writer *writer = 71 | cx_writer_new(fixture->temp_file, ROWS_PER_ROW_GROUP); 72 | assert_not_null(writer); 73 | 74 | for (size_t i = 0; i < COLUMN_COUNT; i++) { 75 | sprintf(buffer, "column %zu", i); 76 | assert_true(cx_writer_add_column(writer, buffer, types[i], 0, 77 | compression, level)); 78 | } 79 | 80 | for (size_t i = 0; i < ROW_COUNT; i++) { 81 | assert_true(cx_writer_put_i32(writer, 0, i)); 82 | assert_true(cx_writer_put_i64(writer, 1, i * 10)); 83 | assert_true(cx_writer_put_bit(writer, 2, i % 3 == 0)); 84 | sprintf(buffer, "cx %zu", i); 85 | 86 | if (i % 12 == 0) 87 | assert_true(cx_writer_put_null(writer, 3)); 88 | else 89 | assert_true(cx_writer_put_str(writer, 3, buffer)); 90 | 91 | assert_true(cx_writer_put_flt(writer, 4, (float)i / 10)); 92 | assert_true(cx_writer_put_dbl(writer, 5, (double)i / 100)); 93 | } 94 | 95 | assert_true(cx_writer_finish(writer, true)); 96 | 97 | assert_true(cx_writer_finish(writer, true)); // noop 98 | 99 | cx_writer_free(writer); 100 | 101 | // high-level reader 102 | struct cx_reader *reader = cx_reader_new(fixture->temp_file); 103 | assert_not_null(reader); 104 | assert_size(cx_reader_column_count(reader), ==, COLUMN_COUNT); 105 | assert_size(cx_reader_row_count(reader), ==, ROW_COUNT); 106 | for (size_t i = 0; i < COLUMN_COUNT; i++) { 107 | sprintf(buffer, "column %zu", i); 108 | 
const char *name = cx_reader_column_name(reader, i); 109 | assert_not_null(name); 110 | assert_string_equal(name, buffer); 111 | 112 | int compression_level = 0; 113 | assert_int(cx_reader_column_type(reader, i), ==, types[i]); 114 | assert_int(cx_reader_column_encoding(reader, i), ==, 0); 115 | assert_int( 116 | cx_reader_column_compression(reader, i, &compression_level), ==, 117 | compression); 118 | assert_int(compression_level, ==, level); 119 | } 120 | size_t position = 0; 121 | for (; cx_reader_next(reader); position++) { 122 | cx_value_t value; 123 | assert_true(cx_reader_get_i32(reader, 0, &value.i32)); 124 | assert_int32(value.i32, ==, position); 125 | assert_true(cx_reader_get_i64(reader, 1, &value.i64)); 126 | assert_int64(value.i64, ==, position * 10); 127 | assert_true(cx_reader_get_bit(reader, 2, &value.bit)); 128 | if (position % 3 == 0) 129 | assert_true(value.bit); 130 | else 131 | assert_false(value.bit); 132 | 133 | assert_true(cx_reader_get_null(reader, 3, &value.bit)); 134 | if (position % 12 == 0) { 135 | assert_true(value.bit); 136 | } else { 137 | assert_false(value.bit); 138 | assert_true(cx_reader_get_str(reader, 3, &value.str)); 139 | sprintf(buffer, "cx %zu", position); 140 | assert_int(value.str.len, ==, strlen(buffer)); 141 | assert_string_equal(buffer, value.str.ptr); 142 | } 143 | assert_true(cx_reader_get_flt(reader, 4, &value.flt)); 144 | assert_float(value.flt, ==, (float)position / 10); 145 | assert_true(cx_reader_get_dbl(reader, 5, &value.dbl)); 146 | assert_double(value.dbl, ==, (double)position / 100); // compare at full double precision 147 | } 148 | assert_false(cx_reader_error(reader)); 149 | assert_size(position, ==, ROW_COUNT); 150 | size_t count = 0; 151 | assert_true(cx_reader_query(reader, 4, (void *)&count, count_rows)); 152 | assert_size(count, ==, ROW_COUNT); 153 | cx_reader_free(reader); 154 | 155 | // high-level reader matching rows 156 | struct cx_predicate *predicate = cx_predicate_new_and( 157 | 5, cx_predicate_new_i32_gt(0, 20), 
cx_predicate_new_i64_lt(1, 900), 158 | cx_predicate_new_bit_eq(2, true), 159 | cx_predicate_negate(cx_predicate_new_null(3)), 160 | cx_predicate_new_str_contains(3, "0", false, CX_STR_LOCATION_END)); 161 | assert_not_null(predicate); 162 | reader = cx_reader_new_matching(fixture->temp_file, predicate); 163 | assert_not_null(reader); 164 | assert_size(cx_reader_column_count(reader), ==, COLUMN_COUNT); 165 | assert_size(cx_reader_row_count(reader), ==, 1); 166 | assert_false(cx_reader_error(reader)); 167 | cx_reader_rewind(reader); 168 | int32_t value; 169 | assert_true(cx_reader_next(reader)); 170 | assert_true(cx_reader_get_i32(reader, 0, &value)); 171 | assert_int(value, ==, 30); 172 | assert_false(cx_reader_next(reader)); 173 | assert_false(cx_reader_error(reader)); 174 | cx_reader_free(reader); 175 | 176 | // low-level reader 177 | struct cx_row_group_reader *row_group_reader = 178 | cx_row_group_reader_new(fixture->temp_file); 179 | assert_not_null(row_group_reader); 180 | assert_size(cx_row_group_reader_row_group_count(row_group_reader), ==, 181 | ROW_GROUP_COUNT); 182 | assert_size(cx_row_group_reader_column_count(row_group_reader), ==, 183 | COLUMN_COUNT); 184 | assert_size(cx_row_group_reader_row_count(row_group_reader), ==, 185 | ROW_COUNT); 186 | for (size_t i = 0; i < COLUMN_COUNT; i++) { 187 | assert_int(cx_row_group_reader_column_type(row_group_reader, i), ==, 188 | types[i]); 189 | assert_int(cx_row_group_reader_column_encoding(row_group_reader, i), 190 | ==, 0); 191 | 192 | int compression_level = 0; 193 | assert_int(cx_row_group_reader_column_compression( 194 | row_group_reader, i, &compression_level), 195 | ==, compression); 196 | assert_int(compression_level, ==, level); 197 | } 198 | position = 0; 199 | for (size_t i = 0; i < ROW_GROUP_COUNT; i++) { 200 | struct cx_row_group *row_group = 201 | cx_row_group_reader_get(row_group_reader, i); 202 | assert_not_null(row_group); 203 | struct cx_row_cursor *cursor = 204 | cx_row_cursor_new(row_group, 
fixture->true_predicate); 205 | assert_not_null(cursor); 206 | for (; cx_row_cursor_next(cursor); position++) { 207 | cx_value_t value; 208 | assert_true(cx_row_cursor_get_i32(cursor, 0, &value.i32)); 209 | assert_int32(value.i32, ==, position); 210 | assert_true(cx_row_cursor_get_i64(cursor, 1, &value.i64)); 211 | assert_int64(value.i64, ==, position * 10); 212 | assert_true(cx_row_cursor_get_bit(cursor, 2, &value.bit)); 213 | if (position % 3 == 0) 214 | assert_true(value.bit); 215 | else 216 | assert_false(value.bit); 217 | assert_true(cx_row_cursor_get_null(cursor, 3, &value.bit)); 218 | if (position % 12 == 0) { 219 | assert_true(value.bit); 220 | } else { 221 | assert_false(value.bit); 222 | assert_true(cx_row_cursor_get_str(cursor, 3, &value.str)); 223 | sprintf(buffer, "cx %zu", position); 224 | assert_int(value.str.len, ==, strlen(buffer)); 225 | assert_string_equal(buffer, value.str.ptr); 226 | } 227 | assert_true(cx_row_cursor_get_flt(cursor, 4, &value.flt)); 228 | assert_float(value.flt, ==, (float)position / 10); 229 | assert_true(cx_row_cursor_get_dbl(cursor, 5, &value.dbl)); 230 | assert_double(value.dbl, ==, (double)position / 100); // compare at full double precision 231 | } 232 | cx_row_cursor_free(cursor); 233 | cx_row_group_free(row_group); 234 | } 235 | assert_size(position, ==, ROW_GROUP_COUNT * ROWS_PER_ROW_GROUP); 236 | cx_row_group_reader_free(row_group_reader); 237 | } 238 | 239 | return MUNIT_OK; 240 | } 241 | 242 | static MunitResult test_no_row_groups(const MunitParameter params[], void *ptr) 243 | { 244 | struct cx_file_fixture *fixture = ptr; 245 | 246 | struct cx_row_group_writer *writer = 247 | cx_row_group_writer_new(fixture->temp_file); 248 | assert_not_null(writer); 249 | 250 | for (size_t i = 0; i < COLUMN_COUNT; i++) 251 | assert_true(cx_row_group_writer_add_column(writer, "foo", CX_COLUMN_I32, 252 | 0, CX_COMPRESSION_NONE, 0)); 253 | 254 | assert_true(cx_row_group_writer_finish(writer, true)); 255 | 256 | cx_row_group_writer_free(writer); 257 | 258 | // 
high-level reader 259 | struct cx_reader *reader = cx_reader_new(fixture->temp_file); 260 | assert_not_null(reader); 261 | assert_size(cx_reader_column_count(reader), ==, COLUMN_COUNT); 262 | assert_size(cx_reader_row_count(reader), ==, 0); 263 | for (size_t i = 0; i < COLUMN_COUNT; i++) { 264 | assert_int(cx_reader_column_type(reader, i), ==, CX_COLUMN_I32); 265 | assert_int(cx_reader_column_encoding(reader, i), ==, 0); 266 | } 267 | cx_reader_free(reader); 268 | 269 | // low-level reader 270 | struct cx_row_group_reader *row_group_reader = 271 | cx_row_group_reader_new(fixture->temp_file); 272 | assert_not_null(row_group_reader); 273 | assert_size(cx_row_group_reader_row_group_count(row_group_reader), ==, 0); 274 | assert_size(cx_row_group_reader_column_count(row_group_reader), ==, 275 | COLUMN_COUNT); 276 | assert_size(cx_row_group_reader_row_count(row_group_reader), ==, 0); 277 | for (size_t i = 0; i < COLUMN_COUNT; i++) { 278 | assert_int(cx_row_group_reader_column_type(row_group_reader, i), ==, 279 | CX_COLUMN_I32); 280 | assert_int(cx_row_group_reader_column_encoding(row_group_reader, i), ==, 281 | 0); 282 | } 283 | cx_row_group_reader_free(row_group_reader); 284 | 285 | return MUNIT_OK; 286 | } 287 | 288 | static MunitResult test_no_columns(const MunitParameter params[], void *ptr) 289 | { 290 | struct cx_file_fixture *fixture = ptr; 291 | struct cx_row_group_writer *writer = 292 | cx_row_group_writer_new(fixture->temp_file); 293 | assert_not_null(writer); 294 | assert_true(cx_row_group_writer_finish(writer, true)); 295 | cx_row_group_writer_free(writer); 296 | 297 | // high-level reader 298 | struct cx_reader *reader = cx_reader_new(fixture->temp_file); 299 | assert_not_null(reader); 300 | assert_size(cx_reader_column_count(reader), ==, 0); 301 | assert_size(cx_reader_row_count(reader), ==, 0); 302 | assert_false(cx_reader_next(reader)); 303 | assert_false(cx_reader_error(reader)); 304 | cx_reader_free(reader); 305 | 306 | // low-level reader 307 | struct 
cx_row_group_reader *row_group_reader = 308 | cx_row_group_reader_new(fixture->temp_file); 309 | assert_not_null(row_group_reader); 310 | assert_size(cx_row_group_reader_row_group_count(row_group_reader), ==, 0); 311 | assert_size(cx_row_group_reader_column_count(row_group_reader), ==, 0); 312 | assert_size(cx_row_group_reader_row_count(row_group_reader), ==, 0); 313 | cx_row_group_reader_free(row_group_reader); 314 | 315 | return MUNIT_OK; 316 | } 317 | 318 | static MunitResult test_empty_columns(const MunitParameter params[], void *ptr) 319 | { 320 | struct cx_file_fixture *fixture = ptr; 321 | 322 | struct cx_predicate *predicate = cx_predicate_new_true(); 323 | assert_not_null(predicate); 324 | 325 | enum cx_compression_type compression; 326 | int level = 5; 327 | 328 | CX_FOREACH(cx_compression_types, compression) 329 | { 330 | struct cx_row_group_writer *writer = 331 | cx_row_group_writer_new(fixture->temp_file); 332 | assert_not_null(writer); 333 | struct cx_row_group *row_group = cx_row_group_new(); 334 | assert_not_null(row_group); 335 | struct cx_column *column = cx_column_new(CX_COLUMN_I32, 0); 336 | assert_not_null(column); 337 | struct cx_column *nulls = cx_column_new(CX_COLUMN_BIT, 0); 338 | assert_not_null(nulls); 339 | assert_true(cx_row_group_writer_add_column(writer, "foo", CX_COLUMN_I32, 340 | 0, compression, level)); 341 | assert_true(cx_row_group_add_column(row_group, column, nulls)); 342 | assert_true(cx_row_group_writer_put(writer, row_group)); 343 | assert_true(cx_row_group_writer_finish(writer, true)); 344 | cx_row_group_writer_free(writer); 345 | cx_row_group_free(row_group); 346 | cx_column_free(column); 347 | cx_column_free(nulls); 348 | 349 | // high-level reader 350 | struct cx_reader *reader = cx_reader_new(fixture->temp_file); 351 | assert_not_null(reader); 352 | const char *name = cx_reader_column_name(reader, 0); 353 | assert_not_null(name); 354 | assert_string_equal(name, "foo"); 355 | assert_size(cx_reader_column_count(reader), 
==, 1); 356 | assert_size(cx_reader_row_count(reader), ==, 0); 357 | assert_false(cx_reader_next(reader)); 358 | assert_false(cx_reader_error(reader)); 359 | cx_reader_free(reader); 360 | 361 | // low-level reader 362 | struct cx_row_group_reader *row_group_reader = 363 | cx_row_group_reader_new(fixture->temp_file); 364 | assert_not_null(row_group_reader); // check the reader created above, not the freed cx_reader 365 | assert_size(cx_row_group_reader_row_group_count(row_group_reader), ==, 366 | 1); 367 | assert_size(cx_row_group_reader_column_count(row_group_reader), ==, 1); 368 | assert_size(cx_row_group_reader_row_count(row_group_reader), ==, 0); 369 | row_group = cx_row_group_reader_get(row_group_reader, 0); 370 | assert_not_null(row_group); 371 | struct cx_row_cursor *cursor = cx_row_cursor_new(row_group, predicate); 372 | assert_not_null(cursor); 373 | assert_false(cx_row_cursor_next(cursor)); 374 | assert_false(cx_row_cursor_error(cursor)); 375 | cx_row_cursor_free(cursor); 376 | cx_row_group_free(row_group); 377 | cx_row_group_reader_free(row_group_reader); 378 | } 379 | 380 | cx_predicate_free(predicate); 381 | 382 | return MUNIT_OK; 383 | } 384 | 385 | static MunitResult test_metadata(const MunitParameter params[], void *ptr) 386 | { 387 | struct cx_file_fixture *fixture = ptr; 388 | 389 | // no metadata 390 | struct cx_writer *writer = cx_writer_new(fixture->temp_file, 1); 391 | assert_not_null(writer); 392 | assert_true(cx_writer_finish(writer, true)); 393 | cx_writer_free(writer); 394 | struct cx_reader *reader = cx_reader_new(fixture->temp_file); 395 | assert_not_null(reader); 396 | const char *metadata = NULL; 397 | assert_true(cx_reader_metadata(reader, &metadata)); 398 | assert_null(metadata); 399 | cx_reader_free(reader); 400 | 401 | // metadata 402 | writer = cx_writer_new(fixture->temp_file, 1); 403 | assert_not_null(writer); 404 | cx_writer_metadata(writer, "foo"); 405 | cx_writer_metadata(writer, "abcxyz"); 406 | cx_writer_metadata(writer, "bar"); // last one wins 407 | assert_true(cx_writer_finish(writer, 
true)); 408 | cx_writer_free(writer); 409 | reader = cx_reader_new(fixture->temp_file); 410 | assert_not_null(reader); 411 | metadata = NULL; 412 | assert_true(cx_reader_metadata(reader, &metadata)); 413 | assert_string_equal(metadata, "bar"); 414 | cx_reader_free(reader); 415 | 416 | return MUNIT_OK; 417 | } 418 | 419 | MunitTest file_tests[] = { 420 | {"/read-write", test_read_write, setup, teardown, MUNIT_TEST_OPTION_NONE, 421 | NULL}, 422 | {"/no-row-groups", test_no_row_groups, setup, teardown, 423 | MUNIT_TEST_OPTION_NONE, NULL}, 424 | {"/no-columns", test_no_columns, setup, teardown, MUNIT_TEST_OPTION_NONE, 425 | NULL}, 426 | {"/empty-columns", test_empty_columns, setup, teardown, 427 | MUNIT_TEST_OPTION_NONE, NULL}, 428 | {"/metadata", test_metadata, setup, teardown, MUNIT_TEST_OPTION_NONE, NULL}, 429 | {NULL, NULL, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}}; 430 | -------------------------------------------------------------------------------- /test/helpers.h: -------------------------------------------------------------------------------- 1 | #define MUNIT_ENABLE_ASSERT_ALIASES 2 | #include "munit.h" 3 | 4 | #define CX_FOREACH(array, item) \ 5 | for (size_t __i = 0; \ 6 | __i < sizeof(array) / sizeof(*(array)) ? 
(((item) = (array)[__i]), 1) \ 7 | : 0; \ 8 | __i++) 9 | -------------------------------------------------------------------------------- /test/index.c: -------------------------------------------------------------------------------- 1 | #define __STDC_LIMIT_MACROS 2 | #include 3 | #include 4 | 5 | #include "index.h" 6 | 7 | #include "helpers.h" 8 | 9 | static MunitResult test_bit_index(const MunitParameter params[], void *fixture) 10 | { 11 | struct cx_column *col = cx_column_new(CX_COLUMN_BIT, CX_ENCODING_NONE); 12 | assert_not_null(col); 13 | 14 | struct cx_index *index = cx_index_new(col); 15 | assert_uint64(index->count, ==, 0); 16 | cx_index_free(index); 17 | 18 | assert_true(cx_column_put_bit(col, false)); 19 | index = cx_index_new(col); 20 | assert_uint64(index->count, ==, 1); 21 | assert_uint64(index->min.bit, ==, false); 22 | assert_uint64(index->max.bit, ==, false); 23 | cx_index_free(index); 24 | 25 | assert_true(cx_column_put_bit(col, false)); 26 | index = cx_index_new(col); 27 | assert_uint64(index->count, ==, 2); 28 | assert_uint64(index->min.bit, ==, false); 29 | assert_uint64(index->max.bit, ==, false); 30 | cx_index_free(index); 31 | 32 | assert_true(cx_column_put_bit(col, true)); 33 | index = cx_index_new(col); 34 | assert_uint64(index->count, ==, 3); 35 | assert_uint64(index->min.bit, ==, false); 36 | assert_uint64(index->max.bit, ==, true); 37 | cx_index_free(index); 38 | 39 | cx_column_free(col); 40 | 41 | col = cx_column_new(CX_COLUMN_BIT, CX_ENCODING_NONE); 42 | assert_not_null(col); 43 | 44 | index = cx_index_new(col); 45 | assert_uint64(index->count, ==, 0); 46 | cx_index_free(index); 47 | 48 | assert_true(cx_column_put_bit(col, true)); 49 | index = cx_index_new(col); 50 | assert_uint64(index->count, ==, 1); 51 | assert_uint64(index->min.bit, ==, true); 52 | assert_uint64(index->max.bit, ==, true); 53 | cx_index_free(index); 54 | 55 | assert_true(cx_column_put_bit(col, true)); 56 | index = cx_index_new(col); 57 | 
assert_uint64(index->count, ==, 2); 58 | assert_uint64(index->min.bit, ==, true); 59 | assert_uint64(index->max.bit, ==, true); 60 | cx_index_free(index); 61 | 62 | assert_true(cx_column_put_bit(col, false)); 63 | index = cx_index_new(col); 64 | assert_uint64(index->count, ==, 3); 65 | assert_uint64(index->min.bit, ==, false); 66 | assert_uint64(index->max.bit, ==, true); 67 | cx_index_free(index); 68 | 69 | cx_column_free(col); 70 | return MUNIT_OK; 71 | } 72 | 73 | static MunitResult test_i32_index(const MunitParameter params[], void *fixture) 74 | { 75 | struct cx_column *col = cx_column_new(CX_COLUMN_I32, CX_ENCODING_NONE); 76 | assert_not_null(col); 77 | 78 | struct cx_index *index = cx_index_new(col); 79 | assert_uint64(index->count, ==, 0); 80 | cx_index_free(index); 81 | 82 | assert_true(cx_column_put_i32(col, 10)); 83 | index = cx_index_new(col); 84 | assert_uint64(index->count, ==, 1); 85 | assert_uint64(index->min.i32, ==, 10); 86 | assert_uint64(index->max.i32, ==, 10); 87 | cx_index_free(index); 88 | 89 | assert_true(cx_column_put_i32(col, 20)); 90 | index = cx_index_new(col); 91 | assert_uint64(index->count, ==, 2); 92 | assert_uint64(index->min.i32, ==, 10); 93 | assert_uint64(index->max.i32, ==, 20); 94 | cx_index_free(index); 95 | 96 | assert_true(cx_column_put_i32(col, 15)); 97 | index = cx_index_new(col); 98 | assert_uint64(index->count, ==, 3); 99 | assert_uint64(index->min.i32, ==, 10); 100 | assert_uint64(index->max.i32, ==, 20); 101 | cx_index_free(index); 102 | 103 | assert_true(cx_column_put_i32(col, INT32_MAX)); 104 | index = cx_index_new(col); 105 | assert_uint64(index->count, ==, 4); 106 | assert_uint64(index->min.i32, ==, 10); 107 | assert_uint64(index->max.i32, ==, INT32_MAX); 108 | cx_index_free(index); 109 | 110 | assert_true(cx_column_put_i32(col, 0)); 111 | index = cx_index_new(col); 112 | assert_uint64(index->count, ==, 5); 113 | assert_uint64(index->min.i32, ==, 0); 114 | assert_uint64(index->max.i32, ==, INT32_MAX); 115 | 
cx_index_free(index); 116 | 117 | assert_true(cx_column_put_i32(col, INT32_MIN)); 118 | index = cx_index_new(col); 119 | assert_uint64(index->count, ==, 6); 120 | assert_uint64(index->min.i32, ==, INT32_MIN); 121 | assert_uint64(index->max.i32, ==, INT32_MAX); 122 | cx_index_free(index); 123 | 124 | cx_column_free(col); 125 | return MUNIT_OK; 126 | } 127 | 128 | static MunitResult test_i64_index(const MunitParameter params[], void *fixture) 129 | { 130 | struct cx_column *col = cx_column_new(CX_COLUMN_I64, CX_ENCODING_NONE); 131 | assert_not_null(col); 132 | 133 | struct cx_index *index = cx_index_new(col); 134 | assert_uint64(index->count, ==, 0); 135 | cx_index_free(index); 136 | 137 | assert_true(cx_column_put_i64(col, 10)); 138 | index = cx_index_new(col); 139 | assert_uint64(index->count, ==, 1); 140 | assert_uint64(index->min.i64, ==, 10); 141 | assert_uint64(index->max.i64, ==, 10); 142 | cx_index_free(index); 143 | 144 | assert_true(cx_column_put_i64(col, 20)); 145 | index = cx_index_new(col); 146 | assert_uint64(index->count, ==, 2); 147 | assert_uint64(index->min.i64, ==, 10); 148 | assert_uint64(index->max.i64, ==, 20); 149 | cx_index_free(index); 150 | 151 | assert_true(cx_column_put_i64(col, 15)); 152 | index = cx_index_new(col); 153 | assert_uint64(index->count, ==, 3); 154 | assert_uint64(index->min.i64, ==, 10); 155 | assert_uint64(index->max.i64, ==, 20); 156 | cx_index_free(index); 157 | 158 | assert_true(cx_column_put_i64(col, INT32_MAX)); 159 | index = cx_index_new(col); 160 | assert_uint64(index->count, ==, 4); 161 | assert_uint64(index->min.i64, ==, 10); 162 | assert_uint64(index->max.i64, ==, INT32_MAX); 163 | cx_index_free(index); 164 | 165 | assert_true(cx_column_put_i64(col, 0)); 166 | index = cx_index_new(col); 167 | assert_uint64(index->count, ==, 5); 168 | assert_uint64(index->min.i64, ==, 0); 169 | assert_uint64(index->max.i64, ==, INT32_MAX); 170 | cx_index_free(index); 171 | 172 | assert_true(cx_column_put_i64(col, INT32_MIN)); 173 | 
index = cx_index_new(col); 174 | assert_uint64(index->count, ==, 6); 175 | assert_uint64(index->min.i64, ==, INT32_MIN); 176 | assert_uint64(index->max.i64, ==, INT32_MAX); 177 | cx_index_free(index); 178 | 179 | assert_true(cx_column_put_i64(col, INT64_MAX)); 180 | index = cx_index_new(col); 181 | assert_uint64(index->count, ==, 7); 182 | assert_uint64(index->min.i64, ==, INT32_MIN); 183 | assert_uint64(index->max.i64, ==, INT64_MAX); 184 | cx_index_free(index); 185 | 186 | assert_true(cx_column_put_i64(col, INT64_MIN)); 187 | index = cx_index_new(col); 188 | assert_uint64(index->count, ==, 8); 189 | assert_uint64(index->min.i64, ==, INT64_MIN); 190 | assert_uint64(index->max.i64, ==, INT64_MAX); 191 | cx_index_free(index); 192 | 193 | cx_column_free(col); 194 | return MUNIT_OK; 195 | } 196 | 197 | static MunitResult test_str_index(const MunitParameter params[], void *fixture) 198 | { 199 | struct cx_column *col = cx_column_new(CX_COLUMN_STR, CX_ENCODING_NONE); 200 | assert_not_null(col); 201 | 202 | struct cx_index *index = cx_index_new(col); 203 | assert_uint64(index->count, ==, 0); 204 | cx_index_free(index); 205 | 206 | assert_true(cx_column_put_str(col, "foo")); 207 | index = cx_index_new(col); 208 | assert_uint64(index->count, ==, 1); 209 | assert_uint64(index->min.len, ==, 3); 210 | assert_uint64(index->max.len, ==, 3); 211 | cx_index_free(index); 212 | 213 | assert_true(cx_column_put_str(col, "foobar")); 214 | index = cx_index_new(col); 215 | assert_uint64(index->count, ==, 2); 216 | assert_uint64(index->min.len, ==, 3); 217 | assert_uint64(index->max.len, ==, 6); 218 | cx_index_free(index); 219 | 220 | assert_true(cx_column_put_str(col, "yeah")); 221 | index = cx_index_new(col); 222 | assert_uint64(index->count, ==, 3); 223 | assert_uint64(index->min.len, ==, 3); 224 | assert_uint64(index->max.len, ==, 6); 225 | cx_index_free(index); 226 | 227 | assert_true(cx_column_put_str(col, "foobarbaz")); 228 | index = cx_index_new(col); 229 | 
assert_uint64(index->count, ==, 4); 230 | assert_uint64(index->min.len, ==, 3); 231 | assert_uint64(index->max.len, ==, 9); 232 | cx_index_free(index); 233 | 234 | assert_true(cx_column_put_str(col, "")); 235 | index = cx_index_new(col); 236 | assert_uint64(index->count, ==, 5); 237 | assert_uint64(index->min.len, ==, 0); 238 | assert_uint64(index->max.len, ==, 9); 239 | cx_index_free(index); 240 | 241 | cx_column_free(col); 242 | return MUNIT_OK; 243 | } 244 | 245 | MunitTest index_tests[] = { 246 | {"/bit-index", test_bit_index, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, 247 | {"/i32-index", test_i32_index, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, 248 | {"/i64-index", test_i64_index, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, 249 | {"/str-index", test_str_index, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, 250 | {NULL, NULL, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}}; 251 | -------------------------------------------------------------------------------- /test/main.c: -------------------------------------------------------------------------------- 1 | #include "munit.h" 2 | 3 | extern MunitTest column_tests[]; 4 | extern MunitTest index_tests[]; 5 | extern MunitTest row_group_tests[]; 6 | extern MunitTest match_tests[]; 7 | extern MunitTest predicate_tests[]; 8 | extern MunitTest row_tests[]; 9 | extern MunitTest compress_tests[]; 10 | extern MunitTest file_tests[]; 11 | 12 | MunitSuite suites[] = { 13 | {"/column", column_tests, NULL, 1, MUNIT_SUITE_OPTION_NONE}, 14 | {"/index", index_tests, NULL, 1, MUNIT_SUITE_OPTION_NONE}, 15 | {"/row-group", row_group_tests, NULL, 1, MUNIT_SUITE_OPTION_NONE}, 16 | {"/match", match_tests, NULL, 1, MUNIT_SUITE_OPTION_NONE}, 17 | {"/predicate", predicate_tests, NULL, 1, MUNIT_SUITE_OPTION_NONE}, 18 | {"/row", row_tests, NULL, 1, MUNIT_SUITE_OPTION_NONE}, 19 | {"/compress", compress_tests, NULL, 1, MUNIT_SUITE_OPTION_NONE}, 20 | {"/file", file_tests, NULL, 1, MUNIT_SUITE_OPTION_NONE}, 21 | {NULL, NULL, NULL, 1, 
MUNIT_SUITE_OPTION_NONE}}; 22 | 23 | static const MunitSuite combined_suite = {"cx", NULL, suites, 1, 24 | MUNIT_SUITE_OPTION_NONE}; 25 | 26 | int main(int argc, char *argv[]) 27 | { 28 | return munit_suite_main(&combined_suite, NULL, argc, argv); 29 | } 30 | -------------------------------------------------------------------------------- /test/match.c: -------------------------------------------------------------------------------- 1 | #include "match.h" 2 | 3 | #include "helpers.h" 4 | 5 | #define RAND_MOD 512 6 | #define ITERATIONS 1000 7 | 8 | static int32_t random_i32() 9 | { 10 | return munit_rand_int_range(-RAND_MOD, RAND_MOD); 11 | } 12 | 13 | static float random_flt() 14 | { 15 | return (float)random_i32() / 100.0; 16 | } 17 | 18 | static MunitResult test_i32(const MunitParameter params[], void *fixture) 19 | { 20 | int32_t values[64]; 21 | int32_t cmp = random_i32(); 22 | for (size_t i = 0; i < ITERATIONS; i++) { 23 | uint64_t eq = 0, lt = 0, gt = 0; 24 | for (size_t j = 0; j < 64; j++) { 25 | values[j] = random_i32(); 26 | if (values[j] == cmp) 27 | eq |= (uint64_t)1 << j; 28 | if (values[j] < cmp) 29 | lt |= (uint64_t)1 << j; 30 | if (values[j] > cmp) 31 | gt |= (uint64_t)1 << j; 32 | } 33 | assert_uint64(eq, ==, cx_match_i32_eq(64, values, cmp)); 34 | assert_uint64(lt, ==, cx_match_i32_lt(64, values, cmp)); 35 | assert_uint64(gt, ==, cx_match_i32_gt(64, values, cmp)); 36 | } 37 | return MUNIT_OK; 38 | } 39 | 40 | static MunitResult test_i64(const MunitParameter params[], void *fixture) 41 | { 42 | int64_t values[64]; 43 | int64_t cmp = random_i32(); 44 | for (size_t i = 0; i < ITERATIONS; i++) { 45 | uint64_t eq = 0, lt = 0, gt = 0; 46 | for (size_t j = 0; j < 64; j++) { 47 | values[j] = random_i32(); 48 | if (values[j] == cmp) 49 | eq |= (uint64_t)1 << j; 50 | if (values[j] < cmp) 51 | lt |= (uint64_t)1 << j; 52 | if (values[j] > cmp) 53 | gt |= (uint64_t)1 << j; 54 | } 55 | assert_uint64(eq, ==, cx_match_i64_eq(64, values, cmp)); 56 | 
assert_uint64(lt, ==, cx_match_i64_lt(64, values, cmp)); 57 | assert_uint64(gt, ==, cx_match_i64_gt(64, values, cmp)); 58 | } 59 | return MUNIT_OK; 60 | } 61 | 62 | static MunitResult test_flt(const MunitParameter params[], void *fixture) 63 | { 64 | float values[64]; 65 | float cmp = random_flt(); 66 | for (size_t i = 0; i < ITERATIONS; i++) { 67 | uint64_t eq = 0, lt = 0, gt = 0; 68 | for (size_t j = 0; j < 64; j++) { 69 | values[j] = random_flt(); 70 | if (values[j] == cmp) 71 | eq |= (uint64_t)1 << j; 72 | if (values[j] < cmp) 73 | lt |= (uint64_t)1 << j; 74 | if (values[j] > cmp) 75 | gt |= (uint64_t)1 << j; 76 | } 77 | assert_uint64(eq, ==, cx_match_flt_eq(64, values, cmp)); 78 | assert_uint64(lt, ==, cx_match_flt_lt(64, values, cmp)); 79 | assert_uint64(gt, ==, cx_match_flt_gt(64, values, cmp)); 80 | } 81 | return MUNIT_OK; 82 | } 83 | 84 | static MunitResult test_dbl(const MunitParameter params[], void *fixture) 85 | { 86 | double values[64]; 87 | double cmp = random_flt(); 88 | for (size_t i = 0; i < ITERATIONS; i++) { 89 | uint64_t eq = 0, lt = 0, gt = 0; 90 | for (size_t j = 0; j < 64; j++) { 91 | values[j] = random_flt(); 92 | if (values[j] == cmp) 93 | eq |= (uint64_t)1 << j; 94 | if (values[j] < cmp) 95 | lt |= (uint64_t)1 << j; 96 | if (values[j] > cmp) 97 | gt |= (uint64_t)1 << j; 98 | } 99 | assert_uint64(eq, ==, cx_match_dbl_eq(64, values, cmp)); 100 | assert_uint64(lt, ==, cx_match_dbl_lt(64, values, cmp)); 101 | assert_uint64(gt, ==, cx_match_dbl_gt(64, values, cmp)); 102 | } 103 | return MUNIT_OK; 104 | } 105 | 106 | static MunitResult test_str(const MunitParameter params[], void *fixture) 107 | { 108 | #define CX_SSE42_PADDING "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" 109 | 110 | #define CX_STR(str) \ 111 | { \ 112 | str CX_SSE42_PADDING, sizeof(str) - 1 \ 113 | } 114 | #define CX_CMP(str) &(struct cx_string)CX_STR(str) 115 | 116 | struct cx_string strings[] = {CX_STR("za"), CX_STR("ab"), CX_STR("ba"), 117 | CX_STR("AB")}; 118 | size_t size = 
sizeof(strings) / sizeof(*strings); 119 | 120 | assert_uint(cx_match_str_eq(size, strings, CX_CMP("foo"), false), ==, 0); 121 | assert_uint(cx_match_str_eq(size, strings, CX_CMP(""), false), ==, 0); 122 | assert_uint(cx_match_str_eq(size, strings, CX_CMP("ab"), true), ==, 0x2); 123 | assert_uint(cx_match_str_eq(size, strings, CX_CMP("ab"), false), ==, 0xA); 124 | 125 | assert_uint(cx_match_str_gt(size, strings, CX_CMP("x"), false), ==, 0x1); 126 | assert_uint(cx_match_str_lt(size, strings, CX_CMP("x"), false), ==, 0xE); 127 | 128 | assert_uint(cx_match_str_contains(size, strings, CX_CMP(""), false, 129 | CX_STR_LOCATION_START), 130 | ==, 0xF); 131 | assert_uint(cx_match_str_contains(size, strings, CX_CMP("a"), false, 132 | CX_STR_LOCATION_START), 133 | ==, 0xA); 134 | assert_uint(cx_match_str_contains(size, strings, CX_CMP("A"), false, 135 | CX_STR_LOCATION_START), 136 | ==, 0xA); 137 | assert_uint(cx_match_str_contains(size, strings, CX_CMP("a"), true, 138 | CX_STR_LOCATION_START), 139 | ==, 0x2); 140 | assert_uint(cx_match_str_contains(size, strings, CX_CMP("x"), false, 141 | CX_STR_LOCATION_START), 142 | ==, 0); 143 | 144 | assert_uint(cx_match_str_contains(size, strings, CX_CMP("z"), false, 145 | CX_STR_LOCATION_END), 146 | ==, 0); 147 | assert_uint(cx_match_str_contains(size, strings, CX_CMP("a"), true, 148 | CX_STR_LOCATION_END), 149 | ==, 0x5); 150 | assert_uint(cx_match_str_contains(size, strings, CX_CMP("a"), false, 151 | CX_STR_LOCATION_END), 152 | ==, 0x5); 153 | assert_uint(cx_match_str_contains(size, strings, CX_CMP("b"), false, 154 | CX_STR_LOCATION_END), 155 | ==, 0xA); 156 | assert_uint(cx_match_str_contains(size, strings, CX_CMP("b"), true, 157 | CX_STR_LOCATION_END), 158 | ==, 0x2); 159 | 160 | assert_uint(cx_match_str_contains(size, strings, CX_CMP("x"), false, 161 | CX_STR_LOCATION_ANY), 162 | ==, 0); 163 | assert_uint(cx_match_str_contains(size, strings, CX_CMP("a"), true, 164 | CX_STR_LOCATION_ANY), 165 | ==, 0x7); 166 | 
assert_uint(cx_match_str_contains(size, strings, CX_CMP("a"), false, 167 | CX_STR_LOCATION_ANY), 168 | ==, 0xF); 169 | 170 | return MUNIT_OK; 171 | } 172 | 173 | MunitTest match_tests[] = { 174 | {"/i32", test_i32, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, 175 | {"/i64", test_i64, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, 176 | {"/flt", test_flt, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, 177 | {"/dbl", test_dbl, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, 178 | {"/str", test_str, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}, 179 | {NULL, NULL, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}}; 180 | -------------------------------------------------------------------------------- /test/row.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "row.h" 4 | 5 | #include "helpers.h" 6 | 7 | #define COLUMN_COUNT 6 8 | #define ROW_COUNT 100 9 | 10 | struct cx_row_fixture { 11 | struct cx_row_group *row_group; 12 | struct cx_column *columns[COLUMN_COUNT]; 13 | struct cx_column *nulls[COLUMN_COUNT]; 14 | struct cx_row_cursor *cursor; 15 | struct cx_predicate *predicate; 16 | }; 17 | 18 | static void *setup(const MunitParameter params[], void *data) 19 | { 20 | struct cx_row_fixture *fixture = malloc(sizeof(*fixture)); 21 | assert_not_null(fixture); 22 | 23 | fixture->row_group = cx_row_group_new(); 24 | assert_not_null(fixture->row_group); 25 | 26 | enum cx_column_type types[] = {CX_COLUMN_I32, CX_COLUMN_I64, CX_COLUMN_BIT, 27 | CX_COLUMN_STR, CX_COLUMN_FLT, CX_COLUMN_DBL}; 28 | 29 | for (size_t i = 0; i < COLUMN_COUNT; i++) { 30 | fixture->columns[i] = cx_column_new(types[i], CX_ENCODING_NONE); 31 | assert_not_null(fixture->columns[i]); 32 | fixture->nulls[i] = cx_column_new(CX_COLUMN_BIT, CX_ENCODING_NONE); 33 | assert_not_null(fixture->nulls[i]); 34 | } 35 | 36 | char buffer[64]; 37 | for (size_t i = 0; i < ROW_COUNT; i++) { 38 | assert_true(cx_column_put_i32(fixture->columns[0], i)); 39 | 
assert_true(cx_column_put_i64(fixture->columns[1], i * 10)); 40 | assert_true(cx_column_put_bit(fixture->columns[2], i % 3 == 0)); 41 | sprintf(buffer, "cx %zu", i); 42 | assert_true(cx_column_put_str(fixture->columns[3], buffer)); 43 | assert_true(cx_column_put_flt(fixture->columns[4], (float)i / 10)); 44 | assert_true(cx_column_put_dbl(fixture->columns[5], (double)i / 100)); 45 | 46 | assert_true(cx_column_put_bit(fixture->nulls[0], i % 2 == 0)); 47 | assert_true(cx_column_put_bit(fixture->nulls[1], i % 3 == 0)); 48 | assert_true(cx_column_put_bit(fixture->nulls[2], true)); 49 | assert_true(cx_column_put_bit(fixture->nulls[3], false)); 50 | assert_true(cx_column_put_bit(fixture->nulls[4], false)); 51 | assert_true(cx_column_put_bit(fixture->nulls[5], false)); 52 | } 53 | 54 | for (size_t i = 0; i < COLUMN_COUNT; i++) 55 | assert_true(cx_row_group_add_column( 56 | fixture->row_group, fixture->columns[i], fixture->nulls[i])); 57 | 58 | fixture->predicate = cx_predicate_new_true(); 59 | assert_not_null(fixture->predicate); 60 | 61 | fixture->cursor = cx_row_cursor_new(fixture->row_group, fixture->predicate); 62 | assert_not_null(fixture->cursor); 63 | 64 | return fixture; 65 | } 66 | 67 | static void teardown(void *ptr) 68 | { 69 | struct cx_row_fixture *fixture = ptr; 70 | cx_row_cursor_free(fixture->cursor); 71 | cx_predicate_free(fixture->predicate); 72 | for (size_t i = 0; i < COLUMN_COUNT; i++) { 73 | cx_column_free(fixture->columns[i]); 74 | cx_column_free(fixture->nulls[i]); 75 | } 76 | cx_row_group_free(fixture->row_group); 77 | free(fixture); 78 | } 79 | 80 | static void test_cursor_position(struct cx_row_cursor *cursor, size_t expected) 81 | { 82 | cx_value_t value; 83 | 84 | assert_true(cx_row_cursor_get_i32(cursor, 0, &value.i32)); 85 | assert_int32(value.i32, ==, expected); 86 | 87 | assert_true(cx_row_cursor_get_i64(cursor, 1, &value.i64)); 88 | assert_int64(value.i64, ==, expected * 10); 89 | 90 | assert_true(cx_row_cursor_get_bit(cursor, 2, 
&value.bit)); 91 | if (expected % 3 == 0) 92 | assert_true(value.bit); 93 | else 94 | assert_false(value.bit); 95 | 96 | assert_true(cx_row_cursor_get_null(cursor, 0, &value.bit)); 97 | if (expected % 2 == 0) 98 | assert_true(value.bit); 99 | else 100 | assert_false(value.bit); 101 | assert_true(cx_row_cursor_get_null(cursor, 1, &value.bit)); 102 | if (expected % 3 == 0) 103 | assert_true(value.bit); 104 | else 105 | assert_false(value.bit); 106 | assert_true(cx_row_cursor_get_null(cursor, 2, &value.bit)); 107 | assert_true(value.bit); 108 | assert_true(cx_row_cursor_get_null(cursor, 3, &value.bit)); 109 | assert_false(value.bit); 110 | 111 | assert_true(cx_row_cursor_get_str(cursor, 3, &value.str)); 112 | char buffer[64]; 113 | sprintf(buffer, "cx %zu", expected); 114 | assert_int(value.str.len, ==, strlen(buffer)); 115 | assert_string_equal(buffer, value.str.ptr); 116 | 117 | assert_true(cx_row_cursor_get_flt(cursor, 4, &value.flt)); 118 | assert_float(value.flt, ==, (float)expected / 10); 119 | 120 | assert_true(cx_row_cursor_get_dbl(cursor, 5, &value.dbl)); 121 | assert_double(value.dbl, ==, (double)expected / 100); 122 | } 123 | 124 | static MunitResult test_count(const MunitParameter params[], void *ptr) 125 | { 126 | struct cx_row_fixture *fixture = ptr; 127 | size_t count = cx_row_cursor_count(fixture->cursor); 128 | assert_size(count, ==, ROW_COUNT); 129 | assert_false(cx_row_cursor_error(fixture->cursor)); 130 | return MUNIT_OK; 131 | } 132 | 133 | static MunitResult test_cursor(const MunitParameter params[], void *ptr) 134 | { 135 | struct cx_row_fixture *fixture = ptr; 136 | size_t position = 0; 137 | while (cx_row_cursor_next(fixture->cursor)) 138 | position++; 139 | assert_size(position, ==, ROW_COUNT); 140 | assert_false(cx_row_cursor_error(fixture->cursor)); 141 | for (size_t repeats = 0; repeats < 10; repeats++) { 142 | cx_row_cursor_rewind(fixture->cursor); 143 | for (position = 0; cx_row_cursor_next(fixture->cursor); position++) { 144 | 
test_cursor_position(fixture->cursor, position); 145 | test_cursor_position(fixture->cursor, position); 146 | } 147 | assert_size(position, ==, ROW_COUNT); 148 | assert_false(cx_row_cursor_error(fixture->cursor)); 149 | } 150 | return MUNIT_OK; 151 | } 152 | 153 | static MunitResult test_cursor_matching(const MunitParameter params[], 154 | void *ptr) 155 | { 156 | struct cx_row_fixture *fixture = ptr; 157 | 158 | struct cx_predicate *predicate = cx_predicate_new_and( 159 | 4, cx_predicate_new_i32_gt(0, 20), cx_predicate_new_i64_lt(1, 900), 160 | cx_predicate_new_bit_eq(2, true), 161 | cx_predicate_new_str_contains(3, "0", false, CX_STR_LOCATION_END)); 162 | assert_not_null(predicate); 163 | 164 | struct cx_row_cursor *cursor = 165 | cx_row_cursor_new(fixture->row_group, predicate); 166 | assert_not_null(cursor); 167 | 168 | size_t position, expected[] = {30, 60}; 169 | 170 | CX_FOREACH(expected, position) 171 | { 172 | assert_true(cx_row_cursor_next(cursor)); 173 | test_cursor_position(cursor, position); 174 | } 175 | assert_false(cx_row_cursor_next(cursor)); 176 | assert_false(cx_row_cursor_error(cursor)); 177 | 178 | cx_row_cursor_free(cursor); 179 | cx_predicate_free(predicate); 180 | 181 | return MUNIT_OK; 182 | } 183 | 184 | static MunitResult test_count_matching(const MunitParameter params[], void *ptr) 185 | { 186 | struct cx_row_fixture *fixture = ptr; 187 | 188 | struct cx_predicate *predicate = cx_predicate_new_and( 189 | 4, cx_predicate_new_i32_gt(0, 20), cx_predicate_new_i64_lt(1, 900), 190 | cx_predicate_new_bit_eq(2, true), 191 | cx_predicate_new_str_contains(3, "0", false, CX_STR_LOCATION_END)); 192 | assert_not_null(predicate); 193 | 194 | struct cx_row_cursor *cursor = 195 | cx_row_cursor_new(fixture->row_group, predicate); 196 | assert_not_null(cursor); 197 | 198 | size_t count = cx_row_cursor_count(cursor); 199 | assert_size(count, ==, 2); 200 | 201 | assert_false(cx_row_cursor_error(cursor)); 202 | 203 | cx_row_cursor_free(cursor); 204 | 
cx_predicate_free(predicate); 205 | 206 | return MUNIT_OK; 207 | } 208 | 209 | static MunitResult test_empty_row_group(const MunitParameter params[], 210 | void *ptr) 211 | { 212 | struct cx_row_fixture *fixture = ptr; 213 | 214 | struct cx_row_group *row_group = cx_row_group_new(); 215 | assert_not_null(row_group); 216 | struct cx_row_cursor *cursor = 217 | cx_row_cursor_new(row_group, fixture->predicate); 218 | assert_not_null(cursor); 219 | 220 | size_t count = cx_row_cursor_count(cursor); 221 | assert_size(count, ==, 0); 222 | assert_false(cx_row_cursor_error(cursor)); 223 | 224 | cx_row_cursor_rewind(cursor); 225 | assert_false(cx_row_cursor_next(cursor)); 226 | assert_false(cx_row_cursor_error(cursor)); 227 | 228 | cx_row_cursor_free(cursor); 229 | cx_row_group_free(row_group); 230 | return MUNIT_OK; 231 | } 232 | 233 | MunitTest row_tests[] = { 234 | {"/cursor", test_cursor, setup, teardown, MUNIT_TEST_OPTION_NONE, NULL}, 235 | {"/count", test_count, setup, teardown, MUNIT_TEST_OPTION_NONE, NULL}, 236 | {"/cursor-matching", test_cursor_matching, setup, teardown, 237 | MUNIT_TEST_OPTION_NONE, NULL}, 238 | {"/count-matching", test_count_matching, setup, teardown, 239 | MUNIT_TEST_OPTION_NONE, NULL}, 240 | {"/empty-row-group", test_empty_row_group, setup, teardown, 241 | MUNIT_TEST_OPTION_NONE, NULL}, 242 | {NULL, NULL, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}}; 243 | -------------------------------------------------------------------------------- /test/row_group.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "row_group.h" 4 | 5 | #include "helpers.h" 6 | 7 | #define COLUMN_COUNT 6 8 | #define ROW_COUNT 1111 9 | 10 | struct cx_row_group_fixture { 11 | struct cx_row_group *row_group; 12 | struct cx_column *columns[COLUMN_COUNT]; 13 | struct cx_column *nulls[COLUMN_COUNT]; 14 | }; 15 | 16 | static void *setup(const MunitParameter params[], void *data) 17 | { 18 | struct cx_row_group_fixture 
*fixture = malloc(sizeof(*fixture)); 19 | assert_not_null(fixture); 20 | 21 | fixture->row_group = cx_row_group_new(); 22 | assert_not_null(fixture->row_group); 23 | 24 | enum cx_column_type types[] = {CX_COLUMN_I32, CX_COLUMN_I64, CX_COLUMN_BIT, 25 | CX_COLUMN_STR, CX_COLUMN_FLT, CX_COLUMN_DBL}; 26 | 27 | for (size_t i = 0; i < COLUMN_COUNT; i++) { 28 | fixture->columns[i] = cx_column_new(types[i], CX_ENCODING_NONE); 29 | assert_not_null(fixture->columns[i]); 30 | fixture->nulls[i] = cx_column_new(CX_COLUMN_BIT, CX_ENCODING_NONE); 31 | assert_not_null(fixture->nulls[i]); 32 | } 33 | 34 | char buffer[64]; 35 | for (size_t i = 0; i < ROW_COUNT; i++) { 36 | assert_true(cx_column_put_i32(fixture->columns[0], i)); 37 | assert_true(cx_column_put_i64(fixture->columns[1], i * 10)); 38 | assert_true(cx_column_put_bit(fixture->columns[2], i % 3 == 0)); 39 | sprintf(buffer, "cx %zu", i); 40 | assert_true(cx_column_put_str(fixture->columns[3], buffer)); 41 | assert_true(cx_column_put_flt(fixture->columns[4], (float)i / 10)); 42 | assert_true(cx_column_put_dbl(fixture->columns[5], (double)i / 100)); 43 | 44 | assert_true(cx_column_put_bit(fixture->nulls[0], i % 2 == 0)); 45 | assert_true(cx_column_put_bit(fixture->nulls[1], i % 3 == 0)); 46 | assert_true(cx_column_put_bit(fixture->nulls[2], false)); 47 | assert_true(cx_column_put_bit(fixture->nulls[3], true)); 48 | assert_true(cx_column_put_bit(fixture->nulls[4], false)); 49 | assert_true(cx_column_put_bit(fixture->nulls[5], false)); 50 | } 51 | 52 | return fixture; 53 | } 54 | 55 | static void teardown(void *ptr) 56 | { 57 | struct cx_row_group_fixture *fixture = ptr; 58 | for (size_t i = 0; i < COLUMN_COUNT; i++) { 59 | cx_column_free(fixture->columns[i]); 60 | cx_column_free(fixture->nulls[i]); 61 | } 62 | cx_row_group_free(fixture->row_group); 63 | free(fixture); 64 | } 65 | 66 | static MunitResult test_add_column(const MunitParameter params[], void *ptr) 67 | { 68 | struct cx_row_group_fixture *fixture = ptr; 69 | struct 
cx_row_group *row_group = fixture->row_group; 70 | 71 | assert_size(cx_row_group_column_count(row_group), ==, 0); 72 | assert_size(cx_row_group_row_count(row_group), ==, 0); 73 | 74 | for (size_t i = 0; i < COLUMN_COUNT; i++) 75 | assert_true(cx_row_group_add_column(row_group, fixture->columns[i], 76 | fixture->nulls[i])); 77 | 78 | assert_size(cx_row_group_column_count(row_group), ==, COLUMN_COUNT); 79 | assert_size(cx_row_group_row_count(row_group), ==, ROW_COUNT); 80 | 81 | for (size_t i = 0; i < COLUMN_COUNT; i++) { 82 | struct cx_column *column = fixture->columns[i]; 83 | struct cx_column *nulls = fixture->nulls[i]; 84 | 85 | assert_ptr_equal(cx_row_group_column(row_group, i), column); 86 | assert_int(cx_row_group_column_type(row_group, i), ==, 87 | cx_column_type(column)); 88 | assert_int(cx_row_group_column_encoding(row_group, i), ==, 89 | cx_column_encoding(column)); 90 | 91 | assert_ptr_equal(cx_row_group_nulls(row_group, i), nulls); 92 | 93 | const struct cx_index *index = cx_row_group_column_index(row_group, i); 94 | assert_uint64(index->count, ==, cx_column_count(column)); 95 | 96 | const struct cx_index *nulls_index = 97 | cx_row_group_null_index(row_group, i); 98 | assert_uint64(nulls_index->count, ==, cx_column_count(nulls)); 99 | } 100 | 101 | return MUNIT_OK; 102 | } 103 | 104 | static MunitResult test_row_count_mismatch(const MunitParameter params[], 105 | void *ptr) 106 | { 107 | struct cx_row_group_fixture *fixture = ptr; 108 | struct cx_row_group *row_group = fixture->row_group; 109 | assert_true(cx_row_group_add_column(row_group, fixture->columns[0], 110 | fixture->nulls[0])); 111 | assert_true(cx_column_put_i64(fixture->columns[1], 30)); 112 | assert_false(cx_row_group_add_column(row_group, fixture->columns[1], 113 | fixture->nulls[1])); 114 | return MUNIT_OK; 115 | } 116 | 117 | static MunitResult test_cursor(const MunitParameter params[], void *ptr) 118 | { 119 | struct cx_row_group_fixture *fixture = ptr; 120 | struct cx_row_group 
*row_group = fixture->row_group; 121 | 122 | for (size_t i = 0; i < COLUMN_COUNT; i++) 123 | assert_true(cx_row_group_add_column(row_group, fixture->columns[i], 124 | fixture->nulls[i])); 125 | 126 | struct cx_row_group_cursor *cursor = cx_row_group_cursor_new(row_group); 127 | assert_not_null(cursor); 128 | 129 | char buffer[64]; 130 | const uint64_t *nulls; 131 | 132 | for (size_t cursor_repeat = 0; cursor_repeat < 2; cursor_repeat++) { 133 | size_t position = 0, count; 134 | for (size_t batch = 0; cx_row_group_cursor_next(cursor); batch++) { 135 | for (size_t batch_repeat = 0; batch_repeat < 2; batch_repeat++) { 136 | const int32_t *i32_batch = 137 | cx_row_group_cursor_batch_i32(cursor, 0, &count); 138 | assert_not_null(i32_batch); 139 | for (size_t i = 0; i < count; i++) 140 | assert_int32(i32_batch[i], ==, position + i); 141 | 142 | nulls = cx_row_group_cursor_batch_nulls(cursor, 0, &count); 143 | assert_not_null(nulls); 144 | for (size_t i = 0; i < count; i++) { 145 | bool bit = *nulls & ((uint64_t)1 << i); 146 | if ((i + position) % 2 == 0) 147 | assert_true(bit); 148 | else 149 | assert_false(bit); 150 | } 151 | } 152 | 153 | if (batch % 3 == 1) { 154 | const int64_t *i64_batch = 155 | cx_row_group_cursor_batch_i64(cursor, 1, &count); 156 | assert_not_null(i64_batch); 157 | for (size_t i = 0; i < count; i++) 158 | assert_int64(i64_batch[i], ==, (position + i) * 10); 159 | 160 | nulls = cx_row_group_cursor_batch_nulls(cursor, 1, &count); 161 | assert_not_null(nulls); 162 | for (size_t i = 0; i < count; i++) { 163 | bool bit = *nulls & ((uint64_t)1 << i); 164 | if ((i + position) % 3 == 0) 165 | assert_true(bit); 166 | else 167 | assert_false(bit); 168 | } 169 | } 170 | 171 | if (batch % 5 == 3) { 172 | const uint64_t *bitset = 173 | cx_row_group_cursor_batch_bit(cursor, 2, &count); 174 | assert_not_null(bitset); 175 | for (size_t i = 0; i < count; i++) { 176 | bool bit = *bitset & ((uint64_t)1 << i); 177 | if ((i + position) % 3 == 0) 178 | 
assert_true(bit); 179 | else 180 | assert_false(bit); 181 | } 182 | 183 | nulls = cx_row_group_cursor_batch_nulls(cursor, 2, &count); 184 | assert_not_null(nulls); 185 | for (size_t i = 0; i < count; i++) { 186 | bool bit = *nulls & ((uint64_t)1 << i); 187 | assert_false(bit); 188 | } 189 | } 190 | 191 | if (batch % 7 == 5) { 192 | const struct cx_string *str_batch = 193 | cx_row_group_cursor_batch_str(cursor, 3, &count); 194 | assert_not_null(str_batch); 195 | for (size_t i = 0; i < count; i++) { 196 | sprintf(buffer, "cx %zu", position + i); 197 | assert_int(str_batch[i].len, ==, strlen(buffer)); 198 | assert_string_equal(buffer, str_batch[i].ptr); 199 | } 200 | 201 | nulls = cx_row_group_cursor_batch_nulls(cursor, 3, &count); 202 | assert_not_null(nulls); 203 | for (size_t i = 0; i < count; i++) { 204 | bool bit = *nulls & ((uint64_t)1 << i); 205 | assert_true(bit); 206 | } 207 | } 208 | 209 | const float *flt_batch = 210 | cx_row_group_cursor_batch_flt(cursor, 4, &count); 211 | assert_not_null(flt_batch); 212 | for (size_t i = 0; i < count; i++) 213 | assert_float(flt_batch[i], ==, (float)(position + i) / 10); 214 | 215 | const double *dbl_batch = 216 | cx_row_group_cursor_batch_dbl(cursor, 5, &count); 217 | assert_not_null(dbl_batch); 218 | for (size_t i = 0; i < count; i++) 219 | assert_double(dbl_batch[i], ==, (double)(position + i) / 100); 220 | 221 | position += count; 222 | } 223 | 224 | assert_size(position, ==, ROW_COUNT); 225 | 226 | cx_row_group_cursor_rewind(cursor); 227 | } 228 | 229 | cx_row_group_cursor_free(cursor); 230 | return MUNIT_OK; 231 | } 232 | 233 | static MunitResult test_cursor_empty(const MunitParameter params[], void *ptr) 234 | { 235 | struct cx_row_group_fixture *fixture = ptr; 236 | struct cx_row_group *row_group = fixture->row_group; 237 | struct cx_row_group_cursor *cursor = cx_row_group_cursor_new(row_group); 238 | assert_not_null(cursor); 239 | assert_false(cx_row_group_cursor_next(cursor)); 240 | 
cx_row_group_cursor_free(cursor); 241 | return MUNIT_OK; 242 | } 243 | 244 | MunitTest row_group_tests[] = { 245 | {"/add-column", test_add_column, setup, teardown, MUNIT_TEST_OPTION_NONE, 246 | NULL}, 247 | {"/row-count-mismatch", test_row_count_mismatch, setup, teardown, 248 | MUNIT_TEST_OPTION_NONE, NULL}, 249 | {"/cursor", test_cursor, setup, teardown, MUNIT_TEST_OPTION_NONE, NULL}, 250 | {"/cursor-empty", test_cursor_empty, setup, teardown, 251 | MUNIT_TEST_OPTION_NONE, NULL}, 252 | {NULL, NULL, NULL, NULL, MUNIT_TEST_OPTION_NONE, NULL}}; 253 | -------------------------------------------------------------------------------- /test/temp_file.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | static char *cx_temp_file_new() 6 | { 7 | char tmp_path[] = "/tmp/cx_temp_file.XXXXXX"; 8 | int fd = mkstemp(tmp_path); 9 | if (fd < 0) 10 | return NULL; 11 | close(fd); 12 | char *path = malloc(sizeof(tmp_path)); 13 | if (!path) 14 | goto error; 15 | memcpy(path, tmp_path, sizeof(tmp_path)); 16 | return path; 17 | error: 18 | unlink(tmp_path); 19 | return NULL; 20 | } 21 | 22 | static void cx_temp_file_free(char *path) 23 | { 24 | unlink(path); 25 | free(path); 26 | } 27 | --------------------------------------------------------------------------------