├── .gitignore ├── Dockerfile.alpine ├── Dockerfile.debian ├── Makefile ├── bsv.py ├── experiments ├── bcut │ ├── bcut.c │ ├── bcut.go │ ├── bcut.rs │ └── readme.md ├── cut │ ├── cut.c │ ├── cut.go │ ├── cut.py │ ├── cut.rs │ └── readme.md └── readme.md ├── license.txt ├── readme.md ├── scripts ├── auto_reload.sh ├── install_archlinux.sh ├── makefile.sh ├── update_readme.py └── version.sh ├── setup.py ├── src ├── _bcopy.c ├── _bcopyraw.c ├── _copy.c ├── _csv.c ├── _gen_bsv.c ├── _gen_csv.c ├── _queue.c ├── bcat.c ├── bcombine.c ├── bcounteach.c ├── bcounteach_hash.c ├── bcountrows.c ├── bcut.c ├── bdedupe.c ├── bdedupe_hash.c ├── bdropuntil.c ├── bhead.c ├── blz4.c ├── blz4d.c ├── bmerge.c ├── bpartition.c ├── bquantile_merge.c ├── bquantile_sketch.c ├── bschema.c ├── bsort.c ├── bsplit.c ├── bsum.c ├── bsumeach.c ├── bsumeach_hash.c ├── bsv.c ├── btake.c ├── btakeuntil.c ├── btopn.c ├── bunzip.c ├── bzip.c ├── csv.c └── xxh3.c ├── test ├── _csv_test.py ├── _queue_test.py ├── bcat_test.py ├── bcombine_test.py ├── bcounteach_hash_test.py ├── bcounteach_test.py ├── bcountrows_test.py ├── bcut_test.py ├── bdedupe_hash_test.py ├── bdedupe_test.py ├── bdropuntil_i64_test.py ├── bdropuntil_test.py ├── blz4d_test.py ├── bmerge_test.py ├── bpartition_lz4_test.py ├── bpartition_test.py ├── bquantile_test.py ├── brmerge_test.py ├── brsort_f64_test.py ├── brsort_i64_test.py ├── brsort_test.py ├── brtopn_test.py ├── bschema_test.py ├── bsort_f64_test.py ├── bsort_i64_test.py ├── bsort_test.py ├── bsplit_test.py ├── bsum_test.py ├── bsumeach_f64_test.py ├── bsumeach_hash_i64_test.py ├── bsumeach_hash_test.py ├── bsumeach_i64_test.py ├── bsv_test.py ├── btake_test.py ├── btakeuntil_i64_test.py ├── btakeuntil_test.py ├── btopn_i64_test.py ├── btopn_test.py ├── bunzip_lz4_test.py ├── bunzip_test.py ├── bzip_lz4_test.py ├── bzip_test.py ├── csv_test.py ├── test_util.py └── xxh3_test.py ├── tox.ini ├── util ├── array.h ├── csv.h ├── dump.h ├── load.h ├── map.h ├── queue.h ├── read.h ├── read_ahead.h ├── read_simple.h ├── row.h ├── util.h ├── write.h └── write_simple.h └── vendor ├── argh.h ├── ddsketch.h ├── heap.h ├── lz4.c ├── lz4.h ├── sort.h ├── xxh3.h └── xxhash.h /.gitignore: -------------------------------------------------------------------------------- 1 | *.gcda 2 | Cargo.lock 3 | target/ 4 | .ccls* 5 | .tox/ 6 | .hypothesis/ 7 | __pycache__/ 8 | .cache/ 9 | /todo*/ 10 | bsv 11 | csv 12 | rcut 13 | _csv 14 | _gen_csv 15 | _read 16 | _write 17 | bucket 18 | bdedupe 19 | bin/bcut 20 | bcut_rust 21 | bcut_go 22 | bcut_c 23 | cut_rust 24 | cut_go 25 | cut_c 26 | bbucket 27 | bcounteach 28 | bsort 29 | bdisjoint 30 | bdropuntil 31 | _copy 32 | _gen_csv_c 33 | xxh3 34 | bsplit 35 | bpartition 36 | bcat 37 | btake 38 | btakeuntil 39 | sums 40 | bsum 41 | brsort 42 | bmerge 43 | brmerge 44 | dist/ 45 | build/ 46 | bcountrows 47 | bsv_plain 48 | bcopy 49 | bsv_ascii 50 | csv_ascii 51 | */psv_*/psv 52 | */psv_*/csv 53 | bschema 54 | _gen_bsv 55 | bsumeach 56 | bsumeachu64 57 | bsumu64 58 | bsumeachf64 59 | bunzip 60 | bzip 61 | bpartitionlz4 62 | bcompress 63 | bdecompress 64 | blz4 65 | blz4d 66 | bunziplz4 67 | bziplz4 68 | bcatlz4 69 | bmergelz4 70 | brmergelz4 71 | bcounteachhash 72 | bsumeachhashu64 73 | bcounteach_hash 74 | bmerge_lz4 75 | bpartition_lz4 76 | brmerge_lz4 77 | bsumeach_f64 78 | bsumeach_hash_u64 79 | bsumeach_u64 80 | bsum_u64 81 | bunzip_lz4 82 | bzip_lz4 83 | bcounteach-hash 84 | bmerge-lz4 85 | bpartition-lz4 86 | brmerge-lz4 87 | bsumeach-f64 88 | bsumeach-hash-u64 89 | 
bsumeach-u64 90 | bsum-u64 91 | bunzip-lz4 92 | bzip-lz4 93 | -copy 94 | -csv 95 | -gen-bsv 96 | -gen-csv 97 | bcat-lz4 98 | bsumeach-hash-f64 99 | bsort-f64 100 | brsort-f64 101 | bsort-u64 102 | bsort-i64 103 | bsumeach-hash-i64 104 | bsumeach-i64 105 | bsum-i64 106 | brsort-i64 107 | bcut 108 | bsumeach-hash 109 | bcombine 110 | version.h 111 | bhead 112 | btail 113 | bdedupe-hash 114 | btopn 115 | bquantile-merge 116 | bquantile-sketch 117 | _bcopy 118 | _bcopyraw 119 | _queue 120 | -------------------------------------------------------------------------------- /Dockerfile.alpine: -------------------------------------------------------------------------------- 1 | FROM alpine:edge 2 | 3 | RUN apk update && apk upgrade && apk add \ 4 | python3-dev \ 5 | py3-numpy \ 6 | coreutils \ 7 | make \ 8 | bash \ 9 | curl \ 10 | git \ 11 | rsync \ 12 | musl-dev \ 13 | gcc 14 | 15 | RUN python3 -m ensurepip 16 | 17 | RUN python3 -m pip install \ 18 | git+https://github.com/nathants/py-util \ 19 | git+https://github.com/nathants/py-shell \ 20 | git+https://github.com/nathants/py-pool \ 21 | git+https://github.com/nathants/cffi-xxh3 \ 22 | hypothesis \ 23 | pytest \ 24 | pytest-timeout \ 25 | pytest-xdist 26 | -------------------------------------------------------------------------------- /Dockerfile.debian: -------------------------------------------------------------------------------- 1 | FROM debian:testing 2 | 3 | RUN apt update && apt upgrade -y && apt install -y \ 4 | python3-pip \ 5 | python3-numpy \ 6 | make \ 7 | git \ 8 | rsync \ 9 | build-essential \ 10 | gcc 11 | 12 | RUN python3 -m pip install \ 13 | git+https://github.com/nathants/py-util \ 14 | git+https://github.com/nathants/py-shell \ 15 | git+https://github.com/nathants/py-pool \ 16 | git+https://github.com/nathants/cffi-xxh3 \ 17 | hypothesis \ 18 | pytest \ 19 | pytest-timeout \ 20 | pytest-xdist 21 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all clean test 2 | CFLAGS=${CC_EXTRA} -Wno-int-conversion -Wno-incompatible-pointer-types -Wno-discarded-qualifiers -Iutil -Ivendor -flto -O3 -march=native -mtune=native -lm 3 | ALL=clean docs _bcopy _bcopyraw _copy _csv _gen_bsv _gen_csv _queue bcat bcombine bcounteach bcounteach-hash bcountrows bcut bdedupe bdedupe-hash bdropuntil bhead blz4 blz4d bmerge bpartition bquantile-merge bquantile-sketch bschema bsort bsplit bsum bsumeach bsumeach-hash bsv btake btakeuntil btopn bunzip bzip csv xxh3 4 | 5 | all: $(ALL) 6 | 7 | setup: 8 | mkdir -p bin 9 | ./scripts/version.sh &>/dev/null 10 | 11 | clean: setup 12 | cd bin && rm -f -- * *.* 13 | 14 | docs: 15 | ./scripts/update_readme.py 16 | 17 | test: setup 18 | tox 19 | 20 | _bcopy: setup 21 | gcc vendor/lz4.c src/_bcopy.c -o bin/_bcopy $(CFLAGS) 22 | 23 | _bcopyraw: setup 24 | gcc vendor/lz4.c src/_bcopyraw.c -o bin/_bcopyraw $(CFLAGS) 25 | 26 | _copy: setup 27 | gcc vendor/lz4.c src/_copy.c -o bin/_copy $(CFLAGS) 28 | 29 | _csv: setup 30 | gcc vendor/lz4.c src/_csv.c -o bin/_csv $(CFLAGS) 31 | 32 | _gen_bsv: setup 33 | gcc vendor/lz4.c src/_gen_bsv.c -o bin/_gen_bsv $(CFLAGS) 34 | 35 | _gen_csv: setup 36 | gcc vendor/lz4.c src/_gen_csv.c -o bin/_gen_csv $(CFLAGS) 37 | 38 | _queue: setup 39 | gcc vendor/lz4.c src/_queue.c -o bin/_queue $(CFLAGS) 40 | 41 | bcat: setup 42 | gcc vendor/lz4.c src/bcat.c -o bin/bcat $(CFLAGS) 43 | 44 | bcombine: setup 45 | gcc vendor/lz4.c src/bcombine.c -o bin/bcombine 
$(CFLAGS) 46 | 47 | bcounteach: setup 48 | gcc vendor/lz4.c src/bcounteach.c -o bin/bcounteach $(CFLAGS) 49 | 50 | bcounteach-hash: setup 51 | gcc vendor/lz4.c src/bcounteach_hash.c -o bin/bcounteach-hash $(CFLAGS) 52 | 53 | bcountrows: setup 54 | gcc vendor/lz4.c src/bcountrows.c -o bin/bcountrows $(CFLAGS) 55 | 56 | bcut: setup 57 | gcc vendor/lz4.c src/bcut.c -o bin/bcut $(CFLAGS) 58 | 59 | bdedupe: setup 60 | gcc vendor/lz4.c src/bdedupe.c -o bin/bdedupe $(CFLAGS) 61 | 62 | bdedupe-hash: setup 63 | gcc vendor/lz4.c src/bdedupe_hash.c -o bin/bdedupe-hash $(CFLAGS) 64 | 65 | bdropuntil: setup 66 | gcc vendor/lz4.c src/bdropuntil.c -o bin/bdropuntil $(CFLAGS) 67 | 68 | bhead: setup 69 | gcc vendor/lz4.c src/bhead.c -o bin/bhead $(CFLAGS) 70 | 71 | blz4: setup 72 | gcc vendor/lz4.c src/blz4.c -o bin/blz4 $(CFLAGS) 73 | 74 | blz4d: setup 75 | gcc vendor/lz4.c src/blz4d.c -o bin/blz4d $(CFLAGS) 76 | 77 | bmerge: setup 78 | gcc vendor/lz4.c src/bmerge.c -o bin/bmerge $(CFLAGS) 79 | 80 | bpartition: setup 81 | gcc vendor/lz4.c src/bpartition.c -o bin/bpartition $(CFLAGS) 82 | 83 | bquantile-merge: setup 84 | gcc vendor/lz4.c src/bquantile_merge.c -o bin/bquantile-merge $(CFLAGS) 85 | 86 | bquantile-sketch: setup 87 | gcc vendor/lz4.c src/bquantile_sketch.c -o bin/bquantile-sketch $(CFLAGS) 88 | 89 | bschema: setup 90 | gcc vendor/lz4.c src/bschema.c -o bin/bschema $(CFLAGS) 91 | 92 | bsort: setup 93 | gcc vendor/lz4.c src/bsort.c -o bin/bsort $(CFLAGS) 94 | 95 | bsplit: setup 96 | gcc vendor/lz4.c src/bsplit.c -o bin/bsplit $(CFLAGS) 97 | 98 | bsum: setup 99 | gcc vendor/lz4.c src/bsum.c -o bin/bsum $(CFLAGS) 100 | 101 | bsumeach: setup 102 | gcc vendor/lz4.c src/bsumeach.c -o bin/bsumeach $(CFLAGS) 103 | 104 | bsumeach-hash: setup 105 | gcc vendor/lz4.c src/bsumeach_hash.c -o bin/bsumeach-hash $(CFLAGS) 106 | 107 | bsv: setup 108 | gcc vendor/lz4.c src/bsv.c -o bin/bsv $(CFLAGS) 109 | 110 | btake: setup 111 | gcc vendor/lz4.c src/btake.c -o bin/btake $(CFLAGS) 112 | 113 | btakeuntil: setup 114 | gcc vendor/lz4.c src/btakeuntil.c -o bin/btakeuntil $(CFLAGS) 115 | 116 | btopn: setup 117 | gcc vendor/lz4.c src/btopn.c -o bin/btopn $(CFLAGS) 118 | 119 | bunzip: setup 120 | gcc vendor/lz4.c src/bunzip.c -o bin/bunzip $(CFLAGS) 121 | 122 | bzip: setup 123 | gcc vendor/lz4.c src/bzip.c -o bin/bzip $(CFLAGS) 124 | 125 | csv: setup 126 | gcc vendor/lz4.c src/csv.c -o bin/csv $(CFLAGS) 127 | 128 | xxh3: setup 129 | gcc vendor/lz4.c src/xxh3.c -o bin/xxh3 $(CFLAGS) 130 | 131 | -------------------------------------------------------------------------------- /bsv.py: -------------------------------------------------------------------------------- 1 | from typing import Generator, Sequence, IO 2 | import struct 3 | import io 4 | 5 | u16 = 'H' 6 | i32 = 'i' 7 | sizeof_i32 = 4 8 | sizeof_u16 = 2 9 | buffer_size = 1024 * 1024 * 5 10 | 11 | def load(f: IO[bytes]) -> Generator[Sequence[bytes], None, None]: 12 | # read chunk header to get size of chunk 13 | while True: 14 | data = f.read(sizeof_i32) 15 | if len(data) == 0: 16 | break 17 | elif len(data) == sizeof_i32: 18 | # read chunk 19 | chunk_size = struct.unpack(i32, data)[0] 20 | buffer = f.read(chunk_size) 21 | assert len(buffer) == chunk_size, [len(buffer), chunk_size] 22 | # buffer = io.BytesIO(buffer) 23 | offset = 0 24 | while True: 25 | # maybe read max index 26 | data = buffer[offset:offset + sizeof_u16] 27 | offset += sizeof_u16 28 | assert len(data) in {0, sizeof_u16} 29 | if len(data) != sizeof_u16: 30 | break 31 | max = struct.unpack(u16, 
data)[0] 32 | # read sizes 33 | size = (max + 1) * sizeof_u16 34 | data = buffer[offset:offset + size] 35 | offset += size 36 | assert len(data) == size 37 | sizes = [struct.unpack(u16, data[i * sizeof_u16:i * sizeof_u16 + sizeof_u16])[0] for i in range(size // sizeof_u16)] 38 | # read value bytes 39 | vals = [] 40 | for size in sizes: 41 | data = buffer[offset:offset + size] 42 | offset += size 43 | assert len(data) == size 44 | assert buffer[offset:offset + 1] == b'\0' 45 | offset += 1 46 | vals.append(data) 47 | yield vals 48 | else: 49 | assert False 50 | 51 | def dump(f: IO[bytes], xss: Sequence[Sequence[bytes]]) -> None: 52 | buffer = io.BytesIO() 53 | for xs in xss: 54 | # write max index 55 | assert sizeof_u16 == buffer.write(struct.pack(u16, len(xs) - 1)) 56 | # write sizes 57 | for x in xs: 58 | assert sizeof_u16 == buffer.write(struct.pack(u16, len(x))) 59 | # write vals 60 | for x in xs: 61 | assert len(x) == buffer.write(x) 62 | assert 1 == buffer.write(b'\0') 63 | # check the chunk size before writing the header, so a failed assert doesn't leave a dangling header on the stream 64 | assert len(buffer.getvalue()) < buffer_size, f"you can't dump more than {buffer_size} bytes at a time" 65 | assert sizeof_i32 == f.write(struct.pack(i32, len(buffer.getvalue()))) 66 | assert len(buffer.getvalue()) == f.write(buffer.getvalue()) 67 | -------------------------------------------------------------------------------- /experiments/bcut/bcut.c: -------------------------------------------------------------------------------- 1 | #include "load.h" 2 | #include "write_simple.h" 3 | 4 | int main(int argc, const char **argv) { 5 | 6 | // setup bsv 7 | SIGPIPE_HANDLER(); 8 | INVARIANTS(); 9 | INCREASE_PIPE_SIZES(); 10 | 11 | // setup input 12 | FILE *in_files[1] = {stdin}; 13 | readbuf_t rbuf; 14 | rbuf_init(&rbuf, in_files, 1, false); 15 | 16 | // setup output 17 | FILE *out_files[1] = {stdout}; 18 | writebuf_t wbuf; 19 | wbuf_init(&wbuf, out_files, 1); 20 | 21 | // setup state 22 | row_t row; 23 | char *f; 24 | char *fs; 25 | i32 field; 26 | i32 num_fields=0; 27 | i32 field_nums[MAX_COLUMNS]; 28 | 29 | // parse args 30 | fs = (char*)argv[1]; 31 | while ((f = strsep(&fs, ","))) { 32 | field = atoi(f); 33 | field_nums[num_fields++] = field - 1; 34 | ASSERT(field <= MAX_COLUMNS, "fatal: cannot select fields above %d, tried to select: %d\n", MAX_COLUMNS, field); 35 | ASSERT(field >= 1, "fatal: fields must be positive, got: %d", field); 36 | ASSERT(num_fields <= MAX_COLUMNS, "fatal: cannot select more than %d fields\n", MAX_COLUMNS); 37 | } 38 | 39 | // process input row by row 40 | while (1) { 41 | load_next(&rbuf, &row, 0); 42 | if (row.stop) 43 | break; 44 | 45 | i32 j = 0; 46 | for (i32 i = 0; i < num_fields; i++) { 47 | field = field_nums[i]; 48 | write_bytes(&wbuf, row.columns[field], row.sizes[field], 0); 49 | if (++j < num_fields) { 50 | write_bytes(&wbuf, ",", 1, 0); 51 | } 52 | } 53 | write_bytes(&wbuf, "\n", 1, 0); 54 | 55 | } 56 | write_flush(&wbuf, 0); 57 | } 58 | -------------------------------------------------------------------------------- /experiments/bcut/bcut.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "encoding/binary" 6 | "io" 7 | "os" 8 | "strconv" 9 | "strings" 10 | ) 11 | 12 | func main() { 13 | var fields []int 14 | for _, x := range strings.Split(os.Args[1], ",") { 15 | x, err := strconv.Atoi(x) 16 | if err != nil { 17 | panic(err) 18 | } 19 | fields = append(fields, x-1) 20 | } 21 | max := int32(0) 22 | sizes := make([]int32, 1<<16) 23 | offsets := make([]int32, 1<<16) 24 | r := 
bufio.NewReader(os.Stdin) 25 | w := bufio.NewWriter(os.Stdout) 26 | defer w.Flush() 27 | // buffer4 := make([]byte, 4) 28 | chunk_offset := int32(0) 29 | chunk_size := int32(0) 30 | chunk_buffer := make([]byte, 1024*1024*5) 31 | for { 32 | 33 | // read chunk size 34 | err := binary.Read(r, binary.LittleEndian, &chunk_size) 35 | if err != nil { 36 | break 37 | } 38 | 39 | // read chunk 40 | _, err = io.ReadFull(r, chunk_buffer[:chunk_size]) 41 | if err != nil { 42 | panic(err) 43 | } 44 | 45 | // read all rows in chunk 46 | chunk_offset = 0 47 | for chunk_offset < chunk_size { 48 | // read row max 49 | max = int32(binary.LittleEndian.Uint16(chunk_buffer[chunk_offset:])) 50 | chunk_offset += 2 51 | 52 | // read row sizes 53 | for i := int32(0); i <= max; i++ { 54 | sizes[i] = int32(binary.LittleEndian.Uint16(chunk_buffer[chunk_offset:])) 55 | chunk_offset += 2 56 | } 57 | 58 | // setup row offsets 59 | for i := int32(0); i <= max; i++ { 60 | offsets[i] = chunk_offset 61 | chunk_offset += sizes[i] + 1 62 | } 63 | 64 | // handle row 65 | for i, f := range fields { 66 | _, err = w.Write(chunk_buffer[offsets[f] : offsets[f]+sizes[f]]) 67 | if err != nil { 68 | panic(err) 69 | } 70 | if i != len(fields)-1 { 71 | _, err = w.Write([]byte(",")) 72 | if err != nil { 73 | panic(err) 74 | } 75 | } 76 | } 77 | _, err = w.Write([]byte("\n")) 78 | if err != nil { 79 | panic(err) 80 | } 81 | } 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /experiments/bcut/bcut.rs: -------------------------------------------------------------------------------- 1 | use std::io::{stdin, stdout, BufReader, BufWriter, Write, Read}; 2 | use std::env::args; 3 | 4 | const MAX_COLUMNS: usize = 1 << 16; 5 | 6 | #[inline] 7 | fn read_i32(buf: &[u8]) -> usize { 8 | let mut out: usize = 0; 9 | let ptr_out = &mut out as *mut usize as *mut u8; 10 | unsafe { 11 | std::ptr::copy_nonoverlapping(buf.as_ptr(), ptr_out, 4); 12 | } 13 | out 14 | } 15 | 16 | 17 | #[inline] 18 | fn read_u16(buf: &[u8]) -> usize { 19 | let mut out: usize = 0; 20 | let ptr_out = &mut out as *mut usize as *mut u8; 21 | unsafe { 22 | std::ptr::copy_nonoverlapping(buf.as_ptr(), ptr_out, 2); 23 | } 24 | out 25 | } 26 | 27 | fn main() { 28 | 29 | // parse args 30 | let fields: Vec = args().collect(); 31 | assert!(fields.len() == 2, "usage: bcut field1,field2,fieldN,..."); 32 | let fields: Vec = fields[1] 33 | .split(",") 34 | .map(|x| x.parse::().unwrap()) 35 | .map(|x| { assert!(x > 0 && x < MAX_COLUMNS as i32); (x - 1) as usize}) 36 | .collect(); 37 | 38 | // setup io 39 | let mut reader = BufReader::with_capacity(1024 * 512, stdin()); 40 | let mut writer = BufWriter::with_capacity(1024 * 512, stdout()); 41 | let mut buffer4: [u8; 4] = [0; 4]; 42 | let mut chunk_offset: usize; 43 | let mut chunk_buffer: [u8; 1024*1024*5] = [0; 1024*1024*5]; 44 | 45 | // setup state 46 | let mut max: usize; 47 | let mut sizes: [usize; MAX_COLUMNS] = [0; MAX_COLUMNS]; 48 | let mut offsets: [usize; MAX_COLUMNS] = [0; MAX_COLUMNS]; 49 | 50 | // process input line by line 51 | while let Ok(_) = reader.read_exact(&mut buffer4) { 52 | // read chunk size 53 | let chunk_size = read_i32(&buffer4); 54 | 55 | // read next chunk 56 | let mut chunk_buffer = &mut chunk_buffer[..chunk_size]; 57 | reader.read_exact(&mut chunk_buffer).unwrap(); 58 | 59 | // read all rows in chunk 60 | chunk_offset = 0; 61 | while chunk_offset < chunk_size { 62 | 63 | // read row max 64 | max = read_u16(&chunk_buffer[chunk_offset..]); 65 | chunk_offset += 2; 
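// row layout inside a chunk, as defined by bsv.py: a u16 max column index, then (max + 1) u16 column sizes, then the value bytes for each column, each value followed by a single \0 terminator (hence the + 1 when advancing past each value below)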
66 | 67 | // read row sizes 68 | for i in 0..max+1 { 69 | sizes[i] = read_u16(&chunk_buffer[chunk_offset..]); 70 | chunk_offset += 2; 71 | } 72 | 73 | // setup row offsets 74 | for i in 0..max+1 { 75 | offsets[i] = chunk_offset; 76 | chunk_offset += sizes[i] + 1; 77 | } 78 | 79 | // handle row 80 | let mut i = 0; 81 | for field in &fields { 82 | assert!(*field <= max, "found a row without enough columns"); 83 | let offset = offsets[*field]; 84 | let size = sizes[*field]; 85 | writer.write_all(&chunk_buffer[offset..offset+size]).unwrap(); 86 | i += 1; 87 | if i < fields.len() { 88 | writer.write_all(&[b',']).unwrap(); 89 | } 90 | } 91 | writer.write_all(&[b'\n']).unwrap(); 92 | 93 | } 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /experiments/bcut/readme.md: -------------------------------------------------------------------------------- 1 | ### experiments with alternate implementations of bcut 2 | 3 | ##### ramfs 4 | ```bash 5 | cd /tmp 6 | ``` 7 | 8 | ##### build bsv and put bin on PATH 9 | ```bash 10 | >> (cd ~/repos/bsv && make) 11 | >> export PATH=$PATH:~/repos/bsv/bin 12 | ``` 13 | 14 | ##### increase max pipe size to 5MB 15 | ```bash 16 | >> sudo sysctl fs.pipe-max-size=5242880 17 | ``` 18 | 19 | ##### make sure we are dealing with bytes only 20 | ```bash 21 | >> export LC_ALL=C 22 | ``` 23 | 24 | ##### make some csv 25 | ```bash 26 | >> time _gen_csv 8 25000000 >data.csv 27 | real 0m7.360s 28 | user 0m6.677s 29 | sys 0m0.680s 30 | ``` 31 | 32 | ##### convert it to bsv 33 | ```bash 34 | >> bsv <data.csv >data.bsv 35 | >> time bsv <data.csv >/dev/null 36 | real 0m5.115s 37 | user 0m4.893s 38 | sys 0m0.220s 39 | ``` 40 | 41 | ##### see how well the data compresses 42 | ```bash 43 | >> time lz4 <data.csv >data.csv.lz4 44 | real 0m5.135s 45 | user 0m4.782s 46 | sys 0m0.349s 47 | 48 | >> time lz4 <data.bsv >data.bsv.lz4 49 | real 0m6.876s 50 | user 0m6.374s 51 | sys 0m0.500s 52 | ``` 53 | 54 | ##### check the sizes, bsv trades space for time 55 | ```bash 56 | >> ls -lh data.* | cut -d' ' -f5,9 57 | 2.2G data.bsv 58 | 1.1G data.bsv.lz4 59 | 1.8G data.csv 60 | 779M data.csv.lz4 61 | ``` 62 | 63 | ##### copy the experiments and make sure they all get the same result 64 | ```bash 65 | >> cp ~/repos/bsv/experiments/bcut/* . 66 | >> cp -r ~/repos/bsv/util . 67 | >> cp -r ~/repos/bsv/vendor . 
68 | 69 | >> cut -d, -f3,7 > go build -o bcut_go bcut.go 73 | >> ./bcut_go 3,7 > rustc -O -o bcut_rust bcut.rs 77 | >> ./bcut_rust 3,7 > gcc -Ivendor -Iutil -O3 -flto -march=native -mtune=native -o bcut_c bcut.c 81 | >> ./bcut_c 3,7 > bcut 3,7 > time cut -d, -f3,7 <data.csv >/dev/null 91 | real 0m5.784s 92 | user 0m5.472s 93 | sys 0m0.311s 94 | ``` 95 | 96 | ##### go is faster 97 | ```bash 98 | >> time ./bcut_go 3,7 <data.bsv >/dev/null 99 | real 0m2.179s 100 | user 0m1.870s 101 | sys 0m0.312s 102 | ``` 103 | 104 | ##### rust is faster 105 | ```bash 106 | >> time ./bcut_rust 3,7 <data.bsv >/dev/null 107 | real 0m1.343s 108 | user 0m1.139s 109 | sys 0m0.203s 110 | ``` 111 | 112 | ##### c is faster 113 | ```bash 114 | >> time ./bcut_c 3,7 <data.bsv >/dev/null 115 | real 0m0.812s 116 | user 0m0.622s 117 | sys 0m0.189s 118 | ``` 119 | -------------------------------------------------------------------------------- /experiments/cut/cut.c: -------------------------------------------------------------------------------- 1 | #include "csv.h" 2 | #include "write_simple.h" 3 | 4 | int main(int argc, char **argv) { 5 | // setup io 6 | CSV_INIT(); 7 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1); 8 | // setup state 9 | char *f; 10 | char *fs; 11 | int field; 12 | int num_fields=0; 13 | int field_nums[MAX_COLUMNS]; 14 | // parse args 15 | fs = (char*)argv[1]; 16 | while ((f = strsep(&fs, ","))) { 17 | field = atoi(f); 18 | field_nums[num_fields++] = field - 1; 19 | ASSERT(field <= MAX_COLUMNS, "fatal: cannot select fields above %d, tried to select: %d\n", MAX_COLUMNS, field); 20 | ASSERT(field >= 1, "fatal: fields must be positive, got: %d", field); 21 | ASSERT(num_fields <= MAX_COLUMNS, "fatal: cannot select more than %d fields\n", MAX_COLUMNS); 22 | } 23 | // process input row by row 24 | while (1) { 25 | CSV_READ_LINE(stdin); 26 | if (csv_stop) 27 | break; 28 | if (csv_max || csv_sizes[0]) { 29 | int j = 0; 30 | for (int i = 0; i < num_fields; i++) { 31 | ASSERT(field_nums[i] <= csv_max, "fatal: not enough columns\n"); 32 | field = field_nums[i]; 33 | write_bytes(&wbuf, csv_columns[field], csv_sizes[field], 0); 34 | if (++j < num_fields) 35 | write_bytes(&wbuf, ",", 1, 0); 36 | } 37 | write_bytes(&wbuf, "\n", 1, 0); 38 | } 39 | } 40 | write_flush(&wbuf, 0); 41 | } 42 | -------------------------------------------------------------------------------- /experiments/cut/cut.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "io" 6 | "os" 7 | "strconv" 8 | "strings" 9 | ) 10 | 11 | func main() { 12 | var fields []int 13 | for _, x := range strings.Split(os.Args[1], ",") { 14 | x, err := strconv.Atoi(x) 15 | if err != nil { 16 | panic(err) 17 | } 18 | fields = append(fields, x-1) 19 | } 20 | starts := make([]int, 1<<16) 21 | ends := make([]int, 1<<16) 22 | r := bufio.NewReader(os.Stdin) 23 | w := bufio.NewWriter(os.Stdout) 24 | defer w.Flush() 25 | for { 26 | // read row 27 | line, isPrefix, err := r.ReadLine() 28 | if isPrefix { 29 | panic("line too long") 30 | } 31 | if err != nil { 32 | if err == io.EOF || err == io.ErrUnexpectedEOF { 33 | break 34 | } 35 | panic(err) 36 | } 37 | // parse row 38 | offset := 0 39 | max := 0 40 | for i := 0; i < len(line); i++ { 41 | switch line[i] { 42 | case byte(','): 43 | starts[max] = offset 44 | ends[max] = i 45 | offset = i + 1 46 | max += 1 47 | } 48 | } 49 | starts[max] = offset 50 | ends[max] = len(line) 51 | // handle row 52 | for i, f := range fields { 53 | _, err = w.Write(line[starts[f]:ends[f]]) 54 | if err != 
nil { 55 | panic(err) 56 | } 57 | if i != len(fields)-1 { 58 | _, err = w.Write([]byte(",")) 59 | if err != nil { 60 | panic(err) 61 | } 62 | } 63 | } 64 | _, err = w.Write([]byte("\n")) 65 | if err != nil { 66 | panic(err) 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /experiments/cut/cut.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | fields = [int(x) - 1 for x in sys.argv[1].split(',')] 4 | 5 | buffer_size = 1024 * 512 6 | 7 | # row metadata 8 | starts = [0 for _ in range(1 << 16)] # type: ignore 9 | ends = [0 for _ in range(1 << 16)] # type: ignore 10 | 11 | # delimiters 12 | comma = bytearray(b',')[0] 13 | newline = bytearray(b'\n')[0] 14 | 15 | # write read_buffer 16 | write_buffer = bytearray(buffer_size) 17 | 18 | while True: 19 | # read read_buffer size 20 | read_buffer = sys.stdin.buffer.read(buffer_size) # type: ignore 21 | stop = len(read_buffer) != buffer_size 22 | # on a full read, extend with the next full line so the read_buffer always ends with a newline 23 | if len(read_buffer) == buffer_size: 24 | read_buffer += sys.stdin.buffer.readline() 25 | read_offset = 0 26 | write_offset = 0 27 | max = 0 28 | # process read_buffer byte by byte 29 | for i in range(len(read_buffer)): 30 | # found the next column 31 | if read_buffer[i] == comma: 32 | starts[max] = read_offset 33 | ends[max] = i 34 | read_offset = i + 1 35 | max += 1 36 | # found the row end 37 | elif read_buffer[i] == newline: 38 | starts[max] = read_offset 39 | ends[max] = i 40 | read_offset = i + 1 # next row starts on the byte following the newline 41 | # handle row 42 | val = b'' 43 | for i, f in enumerate(fields): 44 | val += read_buffer[starts[f]:ends[f]] 45 | if i != len(fields) - 1: 46 | val += b',' 47 | val += b'\n' 48 | # maybe flush and write 49 | if len(val) > len(write_buffer) - write_offset: 50 | sys.stdout.buffer.write(write_buffer[:write_offset]) 51 | write_offset = 0 52 | write_buffer[write_offset:write_offset + len(val)] = val 53 | write_offset += len(val) 54 | # reset for next row 55 | max = 0 56 | # flush 57 | sys.stdout.buffer.write(write_buffer[:write_offset]) 58 | if stop: 59 | break 60 | -------------------------------------------------------------------------------- /experiments/cut/cut.rs: -------------------------------------------------------------------------------- 1 | use std::io::{stdin, stdout, BufReader, BufWriter, Write, BufRead}; 2 | use std::env::args; 3 | 4 | const MAX_COLUMNS: usize = 1 << 16; 5 | 6 | fn main() { 7 | // parse args 8 | let fields: Vec = args().collect(); 9 | assert!(fields.len() == 2, "usage: bcut field1,field2,fieldN,..."); 10 | let fields: Vec = fields[1] 11 | .split(",") 12 | .map(|x| x.parse::().unwrap()) 13 | .map(|x| { assert!(x > 0 && x < MAX_COLUMNS as i32); (x - 1) as usize}) 14 | .collect(); 15 | // setup io 16 | let mut reader = BufReader::with_capacity(1024 * 512, stdin()); 17 | let mut writer = BufWriter::with_capacity(1024 * 512, stdout()); 18 | let mut buffer: Vec = Vec::new(); 19 | // setup state 20 | let mut offsets: [usize; MAX_COLUMNS] = [0; MAX_COLUMNS]; 21 | let mut lens: [usize; MAX_COLUMNS] = [0; MAX_COLUMNS]; 22 | // process input line by line 23 | loop { 24 | // read the next line into the buffer 25 | buffer.clear(); 26 | match reader.read_until(b'\n', &mut buffer) { 27 | Err(err) => std::panic!(err), 28 | Ok(0) => break, 29 | // process the current line 30 | Ok(mut n) => { 31 | if buffer[n - 1] == b'\n' { 32 | n -= 1; 
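// n is now the line length without the trailing newline, so a blank line leaves n == 0 and is skipped by the check below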
33 | } 34 | if n > 0 { 35 | // discover the fields of this row 36 | let mut max = 0; 37 | let mut offset = 0; 38 | for (i, part) in buffer[..n].split(|val| val == &b',').enumerate() { 39 | offsets[i] = offset; 40 | lens[i] = part.len(); 41 | offset += part.len() + 1; 42 | max = i; 43 | } 44 | // output the chosen fields 45 | let mut i = 0; 46 | for field in &fields { 47 | assert!(*field <= max, "found a row without enough columns"); 48 | let offset = offsets[*field]; 49 | let len = lens[*field]; 50 | writer.write_all(&buffer[..n][offset..offset+len]).unwrap(); 51 | i += 1; 52 | if i < fields.len() { 53 | writer.write_all(&[b',']).unwrap(); 54 | } 55 | } 56 | writer.write_all(&[b'\n']).unwrap(); 57 | } 58 | } 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /experiments/readme.md: -------------------------------------------------------------------------------- 1 | ### tldr; 2 | 3 | ```bash 4 | >> export LC_ALL=C 5 | >> time cut -d, -f3,7 <data.csv >/dev/null 6 | real 0m5.784s 7 | user 0m5.472s 8 | sys 0m0.311s 9 | ``` 10 | 11 | ```bash 12 | >> time bcut 3,7 <data.bsv >/dev/null 13 | real 0m1.010s 14 | user 0m0.729s 15 | sys 0m0.280s 16 | ``` 17 | 18 | ```bash 19 | >> time sort --parallel=1 -S50% -k1,1 <data.csv >/dev/null 20 | real 0m22.406s 21 | user 0m21.516s 22 | sys 0m0.880s 23 | ``` 24 | 25 | ```bash 26 | >> time bsort <data.bsv >/dev/null 27 | real 0m13.558s 28 | user 0m12.266s 29 | sys 0m1.139s 30 | ``` 31 | 32 | ```bash 33 | >> time sort -m -k1,1 -S50% csv.*.sorted >/dev/null 34 | real 0m8.846s 35 | user 0m6.692s 36 | sys 0m2.149s 37 | ``` 38 | 39 | ```bash 40 | >> time bmerge $(cat filenames.txt | while read path; do echo $path.sorted; done) >/dev/null 41 | real 0m1.361s 42 | user 0m0.911s 43 | sys 0m0.450s 44 | ``` 45 | 46 | ### alternate implementations and performance experiments 47 | 48 | [cut](https://github.com/nathants/bsv/blob/master/experiments/cut/) in c, rust, go, and pypy 49 | 50 | [bcut](https://github.com/nathants/bsv/blob/master/experiments/bcut/) in c, rust, go and pypy 51 | 52 | [sort and merge](https://github.com/nathants/bsv/blob/master/experiments/cut/readme.md#the-only-random-access-that-should-ever-be-happening-is-sort) with bsv and coreutils 53 | 54 | [linear scan](https://github.com/nathants/bsv/blob/master/experiments/cut/readme.md#if-you-have-sorted-data-you-can-drop-rows-before-a-given-value-efficiently) with bsv and grep 55 | 
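The merge timing above presumes each shard listed in filenames.txt was already sorted; a minimal sketch of the full sort-then-merge pattern (the merged.bsv output name is illustrative):

```bash
>> while read path; do bsort <$path >$path.sorted; done <filenames.txt
>> bmerge $(cat filenames.txt | while read path; do echo $path.sorted; done) >merged.bsv
```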
-------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2018-present Nathan Todd-Stone 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /scripts/auto_reload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eou pipefail 3 | 4 | name=$1 5 | 6 | if ! which aws-ec2-new &>/dev/null; then 7 | echo fatal: need to install https://github.com/nathants/cli-aws 8 | exit 1 9 | fi 10 | 11 | cd $(dirname $(dirname $0)) 12 | 13 | # push code 14 | aws-ec2-rsync . :bsv/ $name -y 15 | 16 | # reinstall bsv 17 | aws-ec2-ssh $name -yc " 18 | cd ~/bsv 19 | make -j && sudo mv -fv bin/* /usr/local/bin 20 | " 21 | 22 | # kill any running reloaders 23 | aws-ec2-ssh $name -yc "(ps -ef | grep entr | grep make | grep -v grep | awk '{print \$2}' | xargs kill) || true" 24 | 25 | # setup the remote reloader 26 | aws-ec2-ssh $name --no-tty -yc " 27 | cd ~/bsv 28 | ((find -type f -name '*.c' -o -name '*.h' | entr -r bash -c 'sudo rm -f /usr/local/bin/b* && make -j && sudo mv -fv bin/* /usr/local/bin') &> ~/bsv.log &) 29 | " 30 | -------------------------------------------------------------------------------- /scripts/makefile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | cd $(dirname $(dirname $0)) 5 | echo ".PHONY: all clean test" > Makefile 6 | echo "CFLAGS=\${CC_EXTRA} -Wno-int-conversion -Wno-incompatible-pointer-types -Wno-discarded-qualifiers -Iutil -Ivendor -flto -O3 -march=native -mtune=native -lm" >> Makefile 7 | echo ALL=clean docs $(for src in src/*.c; do 8 | if basename $src | grep ^_ &>/dev/null; then 9 | basename $src | cut -d. -f1 10 | else 11 | basename $src | cut -d. -f1 | tr '_' '-' 12 | fi 13 | done) >> Makefile 14 | echo >> Makefile 15 | 16 | echo "all: \$(ALL)" >> Makefile 17 | echo >> Makefile 18 | 19 | echo setup: >> Makefile 20 | echo -e '\tmkdir -p bin' >> Makefile 21 | echo -e '\t./scripts/version.sh &>/dev/null' >> Makefile 22 | echo >> Makefile 23 | 24 | echo clean: setup >> Makefile 25 | echo -e '\tcd bin && rm -f -- * *.*' >> Makefile 26 | echo >> Makefile 27 | 28 | echo docs: >> Makefile 29 | echo -e '\t./scripts/update_readme.py' >> Makefile 30 | echo >> Makefile 31 | 32 | echo test: setup >> Makefile 33 | echo -e '\ttox' >> Makefile 34 | echo >> Makefile 35 | 36 | for path in src/*.c; do 37 | if basename $path | grep ^_ &>/dev/null; then 38 | name=$(basename $path | cut -d. -f1) 39 | else 40 | name=$(basename $path | cut -d. -f1 | tr '_' '-') 41 | fi 42 | echo "$name: setup" >> Makefile 43 | echo -e "\tgcc vendor/lz4.c $path -o bin/$name \$(CFLAGS)" >> Makefile 44 | 45 | echo >> Makefile 46 | if ! 
cat .gitignore | grep ^$name$ &>/dev/null; then 47 | echo $name >> .gitignore 48 | fi 49 | done 50 | -------------------------------------------------------------------------------- /scripts/update_readme.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import subprocess 4 | 5 | os.chdir(os.path.dirname(os.path.dirname(__file__))) 6 | co = lambda *a: subprocess.check_output(' '.join(map(str, a)), shell=True, executable='/bin/bash').decode('utf-8').strip() 7 | 8 | with open('readme.md') as f: 9 | xs = f.read().splitlines() 10 | 11 | before = [] 12 | for x in xs: 13 | before.append(x) 14 | if x.startswith('## tools'): 15 | before.append('') 16 | break 17 | 18 | before.append('| name | description |') 19 | before.append('| -- | -- |') 20 | 21 | after = [] 22 | 23 | for path in co('ls src/*.c').splitlines(): 24 | name = path 25 | if not path.split('/')[-1].startswith('_'): 26 | with open(path) as f: 27 | xs = f.read().splitlines() 28 | try: 29 | assert any(x.strip() == 'SETUP();' for x in xs), path 30 | name = path.split('/')[-1].split('.c')[0] 31 | description = [x for x in xs if x.startswith('#define DESCRIPTION')][0].replace('\\n', '\n').split('"')[1] 32 | usage = [x for x in xs if x.startswith('#define USAGE')][0].replace('\\n', '\n').split('"')[1] 33 | try: 34 | example = [x for x in xs if x.startswith('#define EXAMPLE')][0].replace('\\n', '\n').split('"')[1] 35 | except IndexError: 36 | example = '' 37 | while True: 38 | x = xs.pop(0) 39 | if x.startswith('#define EXAMPLE'): 40 | break 41 | while True: 42 | x = xs.pop(0) 43 | if not x.strip('"'): 44 | break 45 | x = x.replace('\\n', '\n').split('"')[1] 46 | if x.strip(): 47 | example += x 48 | except: 49 | print(f'fatal: failed to parse docs in file: {name}.c') 50 | raise 51 | if not name.startswith('_'): 52 | name = name.replace('_', '-') 53 | before.append(f'| [{name}](#{name}) | {description.rstrip()} |'.strip()) 54 | after.append(f'\n### [{name}](https://github.com/nathants/bsv/blob/master/src/{name.replace("-", "_")}.c)\n\n{description}```bash\nusage: {usage.strip()}\n```\n\n```bash\n{example.rstrip()}\n```') 55 | 56 | with open('readme.md', 'w') as f: 57 | f.write('\n'.join(before + after) + '\n') 58 | -------------------------------------------------------------------------------- /scripts/version.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | cd $(dirname $(dirname $0)) 5 | 6 | hash=$(git log -1 --pretty=%H || echo -) 7 | date=$(date -u +'%Y-%m-%dT%H:%M:%SZ') 8 | 9 | if [ -z "$(git status --porcelain)" ]; then 10 | devel=false 11 | else 12 | devel=true 13 | fi 14 | 15 | cat - <<EOF >util/version.h 16 | #define VERSION_GIT_HASH "git: $hash" 17 | #define VERSION_DATE "date: $date" 18 | #define VERSION_DEVEL "devel: $devel" 19 | EOF 20 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | setuptools.setup( 3 | version="0.0.1", 4 | license='mit', 5 | name='bsv', 6 | author='nathan todd-stone', 7 | author_email='me@nathants.com', 8 | url='http://github.com/nathants/bsv', 9 | py_modules=['bsv'], 10 | ) 11 | -------------------------------------------------------------------------------- /src/_bcopy.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "load.h" 3 | 
#include "dump.h" 4 | 5 | #define DESCRIPTION "pass through data, to benchmark load/dump performance\n\n" 6 | #define USAGE "... | bcopy \n\n" 7 | #define EXAMPLE ">> echo a,b,c | bsv | _bcopy | csv\na,b,c\n" 8 | 9 | int main(int argc, char **argv) { 10 | 11 | // setup bsv 12 | SETUP(); 13 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 14 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 15 | 16 | // setup state 17 | row_t row; 18 | 19 | // process input row by row 20 | while (1) { 21 | load_next(&rbuf, &row, 0); 22 | if (row.stop) 23 | break; 24 | dump(&wbuf, &row, 0); 25 | } 26 | dump_flush(&wbuf, 0); 27 | } 28 | -------------------------------------------------------------------------------- /src/_bcopyraw.c: -------------------------------------------------------------------------------- 1 | #define READ_GROWING 2 | #include "load.h" 3 | #include "dump.h" 4 | #include "array.h" 5 | #include "argh.h" 6 | 7 | #define DESCRIPTION "pass through data, to benchmark raw load/dump performance\n\n" 8 | #define USAGE "... | bcopy \n\n" 9 | #define EXAMPLE ">> echo a,b,c | bsv | _bcopyraw | csv\na,b,c\n" 10 | 11 | int main(int argc, char **argv) { 12 | 13 | // setup bsv 14 | SETUP(); 15 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 16 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 17 | 18 | // setup state 19 | row_t row; 20 | raw_row_t raw_row; 21 | ARRAY_INIT(array, raw_row_t*); 22 | 23 | // read 24 | while (1) { 25 | load_next(&rbuf, &row, 0); 26 | if (row.stop) 27 | break; 28 | row_to_raw(&row, &raw_row); 29 | dump_raw(&wbuf, &raw_row, 0); 30 | } 31 | dump_flush(&wbuf, 0); 32 | } 33 | -------------------------------------------------------------------------------- /src/_copy.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | 3 | int main(int argc, char **argv) { 4 | INCREASE_PIPE_SIZES(); 5 | u8 *buff; 6 | i32 size = 1024 * 16; 7 | buff = malloc(size); 8 | i32 wbytes, rbytes; 9 | while (1) { 10 | rbytes = fread_unlocked(buff, 1, size, stdin); 11 | wbytes = fwrite_unlocked(buff, 1, rbytes, stdout); 12 | ASSERT(wbytes == rbytes, "fatal: bad write\n"); 13 | if (rbytes != size) 14 | break; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/_csv.c: -------------------------------------------------------------------------------- 1 | #include "csv.h" 2 | #include "util.h" 3 | 4 | int main(int argc, char **argv) { 5 | SIGPIPE_HANDLER(); 6 | CSV_INIT(); 7 | while (1) { 8 | CSV_READ_LINE(stdin); 9 | if (csv_stop) 10 | break; 11 | for (int i = 0; i <= csv_max; i++) { 12 | fwrite(csv_columns[i], sizeof(char), csv_sizes[i], stdout); 13 | fwrite("\n", sizeof(char), 1, stdout); 14 | } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/_gen_bsv.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "util.h" 8 | #include "dump.h" 9 | 10 | void showusage() { 11 | FPRINTF(stderr, "\nusage: $ _gen_bsv NUM_COLUMNS NUM_ROWS MAXNUM_CHARS [--bytes]\n"); 12 | exit(1); 13 | } 14 | 15 | int main(int argc, char **argv) { 16 | SIGPIPE_HANDLER(); 17 | if (argc < 4) 18 | showusage(); 19 | i32 num_columns = atoi(argv[1]); 20 | i64 num_rows = atol(argv[2]); 21 | i32 max_chars = atoi(argv[3]) + 1; 22 | ASSERT(num_columns >= 0, "fatal: num_columns < 0"); 23 | ASSERT(num_rows >= 0, "fatal: num_rows < 
0"); 24 | bool bytes = false; 25 | if (argc == 5 && strcmp(argv[4], "--bytes") == 0) 26 | bytes = true; 27 | 28 | // setup bsv 29 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 30 | 31 | struct timespec ts; 32 | clock_gettime(CLOCK_MONOTONIC, &ts); 33 | srand(ts.tv_nsec); 34 | 35 | u8 *word; 36 | i32 size; 37 | i32 index; 38 | row_t row; 39 | u8 *buffer; 40 | MALLOC(buffer, BUFFER_SIZE); 41 | i32 i = 0; 42 | i32 offset; 43 | while (i++ < num_rows) { 44 | offset = 0; 45 | row.max = 0; 46 | for (i32 j = 0; j < num_columns; j++) { 47 | i32 num_chars = rand() % max_chars; 48 | for (i32 k = 0; k < num_chars; k++) { 49 | i32 val = rand(); 50 | if (bytes) { 51 | buffer[offset + k] = val; 52 | } else { 53 | if (val % 2) { 54 | buffer[offset + k] = (val % (122 - 97)) + 97; // a-z 55 | } else { 56 | buffer[offset + k] = (val % (57 - 48)) + 48; // 0-9 57 | } 58 | } 59 | } 60 | row.sizes[j] = num_chars; 61 | row.columns[j] = buffer + offset; 62 | offset += num_chars; 63 | row.max = j; 64 | 65 | } 66 | dump(&wbuf, &row, 0); 67 | } 68 | dump_flush(&wbuf, 0); 69 | } 70 | -------------------------------------------------------------------------------- /src/_gen_csv.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "util.h" 7 | 8 | void showusage() { 9 | FPRINTF(stderr, "\nusage: $ gen-csv NUM_COLUMNS NUM_ROWS MAXNUM_CHARS\n"); 10 | exit(1); 11 | } 12 | 13 | int main(int argc, char **argv) { 14 | SIGPIPE_HANDLER(); 15 | if (argc < 4) 16 | showusage(); 17 | i32 num_columns = atoi(argv[1]); 18 | i64 num_rows = atol(argv[2]); 19 | i32 max_chars = atoi(argv[3]) + 1; 20 | ASSERT(num_columns >= 0, "fatal: num_columns < 0"); 21 | ASSERT(num_rows >= 0, "fatal: num_rows < 0"); 22 | i32 num_words, add_delimiter; 23 | 24 | struct timespec ts; 25 | clock_gettime(CLOCK_MONOTONIC, &ts); 26 | srand(ts.tv_nsec); 27 | 28 | i32 i = 0; 29 | while (i++ < num_rows) { 30 | add_delimiter = 0; 31 | for (i32 j = 0; j < num_columns; j++) { 32 | if (add_delimiter) 33 | FPUTS(","); 34 | i32 num_chars = rand() % max_chars; 35 | for (i32 i = 0; i < num_chars; i++) { 36 | i32 val = rand(); 37 | char c[1]; 38 | if (val % 2) { 39 | c[0] = (val % (122 - 97)) + 97; // a-z 40 | } else { 41 | c[0] = (val % (57 - 48)) + 48; // 0-9 42 | } 43 | FPUTS(c); 44 | 45 | } 46 | add_delimiter = 1; 47 | } 48 | FPUTS("\n"); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/_queue.c: -------------------------------------------------------------------------------- 1 | #include "csv.h" 2 | #include "util.h" 3 | #include "queue.h" 4 | 5 | int main(int argc, char **argv) { 6 | SIGPIPE_HANDLER(); 7 | CSV_INIT(); 8 | ASSERT(argc == 2, "argc: %d != 2\n", argc); 9 | int capacity = atoi(argv[1]); 10 | queue_t *q = queue_init(capacity); 11 | while (1) { 12 | CSV_READ_LINE(stdin); 13 | if (csv_stop) 14 | break; 15 | ASSERT(0 == csv_max, "csv_max: %d != 0\n", csv_max); 16 | u8 *action = csv_columns[0]; 17 | i32 size = csv_sizes[0]; 18 | u8 *val; 19 | if (strncmp(action, "get", 3) == 0) { 20 | val = queue_get(q); 21 | if (val) { 22 | printf("%s\n", val); 23 | free(val); 24 | } else { 25 | printf("empty\n"); 26 | } 27 | } else if (strncmp(action, "put", 3) == 0) { 28 | action += 4; // action = "put VALUE" 29 | MALLOC(val, size - 4); 30 | memset(val, 0, size - 4); 31 | strncpy(val, action, size - 4); 32 | if (queue_put(q, val)) { 33 | printf("full\n"); 34 | } 35 | } 36 | } 37 | } 38 | 
-------------------------------------------------------------------------------- /src/bcat.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "argh.h" 3 | #include "load.h" 4 | #include "write_simple.h" 5 | 6 | #define DESCRIPTION "cat some bsv files to csv\n\n" 7 | #define USAGE "bcat [-l|--lz4] [-p|--prefix] [-h N|--head N] FILE1 ... FILEN\n\n" 8 | #define EXAMPLE \ 9 | ">> for char in a a b b c c; do\n" \ 10 | " echo $char | bsv >> /tmp/$char\n" \ 11 | " done\n\n" \ 12 | ">> bcat --head 1 --prefix /tmp/{a,b,c}\n" \ 13 | "/tmp/a:a\n" \ 14 | "/tmp/b:b\n" \ 15 | "/tmp/c:c\n" 16 | 17 | int main(int argc, char **argv) { 18 | // setup bsv 19 | SETUP(); 20 | 21 | // setup state 22 | i32 ran = 0; 23 | i64 line; 24 | 25 | // parse args 26 | bool prefix = false; 27 | bool lz4 = false; 28 | i64 head = 0; 29 | ARGH_PARSE { 30 | ARGH_NEXT(); 31 | if ARGH_BOOL("-p", "--prefix") { prefix = true; } 32 | else if ARGH_BOOL("-l", "--lz4") { lz4 = true; } 33 | else if ARGH_FLAG("-h", "--head") { ASSERT(isdigits(ARGH_VAL()), "fatal: should have been `--head INT`, not `--head %s`\n", ARGH_VAL()); 34 | head = atol(ARGH_VAL());} 35 | } 36 | 37 | // setup input 38 | ASSERT(ARGH_ARGC > 0, "usage: %s", USAGE); 39 | FILE *files[ARGH_ARGC]; 40 | for (i32 i = 0; i < ARGH_ARGC; i++) 41 | FOPEN(files[i], ARGH_ARGV[i], "rb"); 42 | readbuf_t rbuf = rbuf_init(files, ARGH_ARGC, lz4); 43 | row_t row; 44 | 45 | // setup output 46 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1); 47 | 48 | // process input row by row 49 | for (i32 i = 0; i < ARGH_ARGC; i++) { 50 | line = 0; 51 | while (1) { 52 | line++; 53 | load_next(&rbuf, &row, i); 54 | if (row.stop) 55 | break; 56 | if (head != 0 && line > head) 57 | break; 58 | if (prefix) { 59 | write_bytes(&wbuf, ARGH_ARGV[i], strlen(ARGH_ARGV[i]), 0); 60 | write_bytes(&wbuf, ":", 1, 0); 61 | } 62 | for (i32 j = 0; j <= row.max; j++) { 63 | write_bytes(&wbuf, row.columns[j], row.sizes[j], 0); 64 | if (j != row.max) 65 | write_bytes(&wbuf, ",", 1, 0); 66 | } 67 | write_bytes(&wbuf, "\n", 1, 0); 68 | ran = 1; 69 | } 70 | } 71 | if (ran == 0) 72 | write_bytes(&wbuf, "\n", 1, 0); 73 | write_flush(&wbuf, 0); 74 | } 75 | -------------------------------------------------------------------------------- /src/bcombine.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "load.h" 3 | #include "dump.h" 4 | 5 | #define DESCRIPTION "prepend a new column by combining values from existing columns\n\n" 6 | #define USAGE "... 
| bcombine COL1,...,COLN\n\n" 7 | #define EXAMPLE ">> echo a,b,c | bsv | bcombine 3,2 | csv\nc:b,a,b,c\n" 8 | 9 | #define PARSE_ARGV() \ 10 | do { \ 11 | ASSERT(argc == 2, "usage: %s", USAGE); \ 12 | char *f; \ 13 | char *fs = (char*)argv[1]; \ 14 | while ((f = strsep(&fs, ","))) { \ 15 | index = atoi(f); \ 16 | indices[num_fields++] = index - 1; \ 17 | ASSERT(index <= MAX_COLUMNS, "fatal: cannot select indices above %d, tried to select: %d\n", MAX_COLUMNS, index); \ 18 | ASSERT(index > 0, "fatal: indices must be positive, got: %d\n", index); \ 19 | } \ 20 | ASSERT(num_fields <= MAX_COLUMNS, "fatal: cannot select more than %d indices\n", MAX_COLUMNS); \ 21 | } while (0) 22 | 23 | int main(int argc, char **argv) { 24 | 25 | // setup bsv 26 | SETUP(); 27 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 28 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 29 | 30 | // setup state 31 | i32 num_fields = 0; 32 | i32 indices[MAX_COLUMNS]; 33 | i32 index; 34 | PARSE_ARGV(); 35 | row_t row; 36 | row_t new; 37 | u8 *buffer; 38 | MALLOC(buffer, BUFFER_SIZE); 39 | i32 size; 40 | 41 | // process input row by row 42 | while (1) { 43 | load_next(&rbuf, &row, 0); 44 | if (row.stop) 45 | break; 46 | for (i32 i = 0; i <= row.max; i++) { 47 | new.sizes[i + 1] = row.sizes[i]; 48 | new.columns[i + 1] = row.columns[i]; 49 | } 50 | size = 0; 51 | for (i32 i = 0; i < num_fields; i++) { 52 | index = indices[i]; 53 | ASSERT(index <= row.max, "fatal: line with %d columns, needed %d\n", row.max + 1, index + 1); 54 | ASSERT(size + row.sizes[index] < BUFFER_SIZE, "fatal: bcombine buffer overflow\n"); 55 | memcpy(buffer + size, row.columns[index], row.sizes[index]); 56 | size += row.sizes[index]; 57 | if (i < num_fields - 1) { 58 | buffer[size] = ':'; 59 | size++; 60 | } 61 | } 62 | new.columns[0] = buffer; 63 | new.sizes[0] = size; 64 | new.max = row.max + 1; 65 | dump(&wbuf, &new, 0); 66 | } 67 | dump_flush(&wbuf, 0); 68 | } 69 | -------------------------------------------------------------------------------- /src/bcounteach.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "load.h" 3 | #include "dump.h" 4 | 5 | #define DESCRIPTION "count as i64 each contiguous identical row by the first column\n\n" 6 | #define USAGE "... 
| bcounteach\n\n" 7 | #define EXAMPLE "echo '\na\na\nb\nb\nb\na\n' | bsv | bcounteach | bschema *,i64:a | csv\na,2\nb,3\na,1\n" 8 | 9 | #define DUMP_COUNT() \ 10 | do { \ 11 | if (size > 0) { \ 12 | new.columns[0] = buffer; \ 13 | new.sizes[0] = size; \ 14 | new.columns[1] = &count; \ 15 | new.sizes[1] = sizeof(i64); \ 16 | new.max = 1; \ 17 | dump(&wbuf, &new, 0); \ 18 | } \ 19 | } while(0) 20 | 21 | int main(int argc, char **argv) { 22 | 23 | // setup bsv 24 | SETUP(); 25 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 26 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 27 | 28 | // setup state 29 | i64 count = 0; 30 | i32 size = 0; 31 | u8 *buffer; 32 | row_t row; 33 | row_t new; 34 | MALLOC(buffer, BUFFER_SIZE); 35 | 36 | // process input row by row 37 | while (1) { 38 | load_next(&rbuf, &row, 0); 39 | if (row.stop) 40 | break; 41 | count++; 42 | if (compare_str(buffer, row.columns[0]) != 0) { 43 | DUMP_COUNT(); 44 | memcpy(buffer, row.columns[0], row.sizes[0] + 1); // +1 for the trailing \0 45 | size = row.sizes[0]; 46 | count = 0; 47 | } 48 | } 49 | 50 | // flush last value 51 | count += 1; 52 | DUMP_COUNT(); 53 | dump_flush(&wbuf, 0); 54 | } 55 | -------------------------------------------------------------------------------- /src/bcounteach_hash.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "load.h" 3 | #include "dump.h" 4 | #include "map.h" 5 | 6 | #define DESCRIPTION "count as i64 by hash of the first column\n\n" 7 | #define USAGE "... | bcounteach-hash\n\n" 8 | #define EXAMPLE "echo '\na\na\nb\nb\nb\na\n' | bsv | bcounteach-hash | bschema *,i64:a | bsort | csv\na,3\nb,3\n" 9 | 10 | int main(int argc, char **argv) { 11 | 12 | // setup bsv 13 | SETUP(); 14 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 15 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 16 | 17 | // setup state 18 | row_t row; 19 | 20 | MAP_INIT(counts, i64, 1<<16); 21 | MAP_ALLOC(counts, i64); 22 | 23 | while (1) { 24 | load_next(&rbuf, &row, 0); 25 | if (row.stop) { 26 | break; 27 | } 28 | MAP_SET_INDEX(counts, row.columns[0], row.sizes[0], i64); 29 | MAP_VALUE(counts)++; 30 | } 31 | 32 | for (i32 i = 0; i < MAP_SIZE(counts); i++) { 33 | if (MAP_KEYS(counts)[i] != NULL) { 34 | row.max = 1; 35 | row.columns[0] = MAP_KEYS(counts)[i]; 36 | row.sizes[0] = MAP_SIZES(counts)[i]; 37 | row.columns[1] = &MAP_VALUES(counts)[i]; 38 | row.sizes[1] = sizeof(i64); 39 | dump(&wbuf, &row, 0); 40 | } 41 | } 42 | dump_flush(&wbuf, 0); 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/bcountrows.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "load.h" 3 | #include "dump.h" 4 | 5 | #define DESCRIPTION "count rows as i64\n\n" 6 | #define USAGE "... 
| bcountrows\n\n" 7 | #define EXAMPLE ">> echo '\n1\n2\n3\n4\n' | bsv | bcountrows | csv\n4\n\n" 8 | 9 | int main(int argc, char **argv) { 10 | 11 | // setup bsv 12 | SETUP(); 13 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 14 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 15 | 16 | // setup state 17 | i64 count = 0; 18 | row_t row; 19 | 20 | // process input row by row 21 | while (1) { 22 | load_next(&rbuf, &row, 0); 23 | if (row.stop) 24 | break; 25 | count++; 26 | } 27 | 28 | // output value 29 | row.max = 0; 30 | row.columns[0] = &count; 31 | row.sizes[0] = sizeof(i64); 32 | dump(&wbuf, &row, 0); 33 | dump_flush(&wbuf, 0); 34 | } 35 | -------------------------------------------------------------------------------- /src/bcut.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "load.h" 3 | #include "dump.h" 4 | 5 | #define DESCRIPTION "select some columns\n\n" 6 | #define USAGE "... | bcut COL1,...,COLN\n\n" 7 | #define EXAMPLE ">> echo a,b,c | bsv | bcut 3,3,3,2,2,1 | csv\nc,c,c,b,b,a\n" 8 | 9 | int main(int argc, char **argv) { 10 | 11 | // setup bsv 12 | SETUP(); 13 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 14 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 15 | 16 | // setup state 17 | i32 num_fields = 0; 18 | i32 indices[MAX_COLUMNS]; 19 | i32 index; 20 | ASSERT(argc == 2, "usage: %s", USAGE); 21 | char *f; 22 | char *fs = (char*)argv[1]; 23 | while ((f = strsep(&fs, ","))) { 24 | index = atoi(f); 25 | indices[num_fields++] = index - 1; 26 | ASSERT(index <= MAX_COLUMNS, "fatal: cannot select indices above %d, tried to select: %d\n", MAX_COLUMNS, index); 27 | ASSERT(index > 0, "fatal: indices must be positive, got: %d\n", index); 28 | } 29 | ASSERT(num_fields <= MAX_COLUMNS, "fatal: cannot select more than %d indices\n", MAX_COLUMNS); 30 | row_t row; 31 | row_t new; 32 | 33 | // process input row by row 34 | while (1) { 35 | load_next(&rbuf, &row, 0); 36 | if (row.stop) 37 | break; 38 | for (i32 i = 0; i < num_fields; i++) { 39 | index = indices[i]; 40 | ASSERT(index <= row.max, "fatal: line with %d columns, needed %d\n", row.max + 1, index + 1); 41 | new.columns[i] = row.columns[index]; 42 | new.sizes[i] = row.sizes[index]; 43 | } 44 | new.max = num_fields - 1; 45 | dump(&wbuf, &new, 0); 46 | } 47 | dump_flush(&wbuf, 0); 48 | } 49 | -------------------------------------------------------------------------------- /src/bdedupe.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "load.h" 3 | #include "dump.h" 4 | 5 | #define DESCRIPTION "dedupe identical contiguous rows by the first column, keeping the first\n\n" 6 | #define USAGE "... 
| bdedupe\n\n" 7 | #define EXAMPLE ">> echo '\na\na\nb\nb\na\na\n' | bsv | bdedupe | csv\na\nb\na\n" 8 | 9 | int main(int argc, char **argv) { 10 | 11 | // setup bsv 12 | SETUP(); 13 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 14 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 15 | 16 | // setup state 17 | u8 *buffer; 18 | MALLOC(buffer, BUFFER_SIZE); 19 | memset(buffer, 0, BUFFER_SIZE); 20 | row_t row; 21 | 22 | // process input row by row 23 | while (1) { 24 | load_next(&rbuf, &row, 0); 25 | if (row.stop) 26 | break; 27 | if (compare_str(buffer, row.columns[0]) != 0) { 28 | dump(&wbuf, &row, 0); 29 | memcpy(buffer, row.columns[0], row.sizes[0] + 1); // +1 for the trailing \0 30 | } 31 | } 32 | dump_flush(&wbuf, 0); 33 | } 34 | -------------------------------------------------------------------------------- /src/bdedupe_hash.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "load.h" 3 | #include "dump.h" 4 | #include "map.h" 5 | 6 | #define DESCRIPTION "dedupe rows by hash of the first column, keeping the first\n\n" 7 | #define USAGE "... | bdedupe-hash\n\n" 8 | #define EXAMPLE ">> echo '\na\na\nb\nb\na\na\n' | bsv | bdedupe-hash | csv\na\nb\n" 9 | 10 | int main(int argc, char **argv) { 11 | 12 | // setup bsv 13 | SETUP(); 14 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 15 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 16 | 17 | // setup state 18 | row_t row; 19 | MAP_INIT(dupes, u8, 1<<16); 20 | MAP_ALLOC(dupes, u8); 21 | 22 | // process input row by row 23 | while (1) { 24 | load_next(&rbuf, &row, 0); 25 | if (row.stop) 26 | break; 27 | MAP_SET_INDEX(dupes, row.columns[0], row.sizes[0], u8); 28 | if (MAP_VALUE(dupes) == 0) { 29 | MAP_VALUE(dupes) = 1; 30 | dump(&wbuf, &row, 0); 31 | } 32 | } 33 | dump_flush(&wbuf, 0); 34 | } 35 | -------------------------------------------------------------------------------- /src/bdropuntil.c: -------------------------------------------------------------------------------- 1 | #include "read_ahead.h" 2 | #include "util.h" 3 | #include "load.h" 4 | #include "dump.h" 5 | 6 | #define DESCRIPTION "for sorted input, drop until the first column is gte to VALUE\n\n" 7 | #define USAGE "... 
| bdropuntil VALUE [TYPE]\n\n" 8 | #define EXAMPLE ">> echo '\na\nb\nc\nd\n' | bsv | bdropuntil c | csv\nc\nd\n\n" 9 | 10 | int main(int argc, char **argv) { 11 | 12 | // setup bsv 13 | SETUP(); 14 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 15 | readaheadbuf_t rabuf = rabuf_init(1); 16 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 17 | 18 | // setup state 19 | bool done_skipping = false; 20 | bool matched = false; 21 | i32 cmp; 22 | row_t row; 23 | ASSERT(argc >= 2, "usage: %s", USAGE); 24 | i64 val_i64; 25 | i32 val_i32; 26 | i16 val_i16; 27 | u64 val_u64; 28 | u32 val_u32; 29 | u16 val_u16; 30 | f64 val_f64; 31 | f32 val_f32; 32 | void *val; 33 | i32 value_type; 34 | if (argc == 2) { 35 | val = argv[1]; 36 | value_type = STR; 37 | } else { 38 | ASSERT(argc == 3, "usage: %s", USAGE); 39 | if (strcmp(argv[2], "i64") == 0) { value_type = I64; val_i64 = atol(argv[1]); val = &val_i64; } 40 | else if (strcmp(argv[2], "i32") == 0) { value_type = I32; val_i32 = atol(argv[1]); val = &val_i32; } 41 | else if (strcmp(argv[2], "i16") == 0) { value_type = I16; val_i16 = atol(argv[1]); val = &val_i16; } 42 | else if (strcmp(argv[2], "u64") == 0) { value_type = U64; val_u64 = atol(argv[1]); val = &val_u64; } 43 | else if (strcmp(argv[2], "u32") == 0) { value_type = U32; val_u32 = atol(argv[1]); val = &val_u32; } 44 | else if (strcmp(argv[2], "u16") == 0) { value_type = U16; val_u16 = atol(argv[1]); val = &val_u16; } 45 | else if (strcmp(argv[2], "f64") == 0) { value_type = F64; val_f64 = atof(argv[1]); val = &val_f64; } 46 | else if (strcmp(argv[2], "f32") == 0) { value_type = F32; val_f32 = atof(argv[1]); val = &val_f32; } 47 | else ASSERT(0, "fatal: bad type %s\n", argv[2]); 48 | } 49 | 50 | // process input row by row 51 | while (1) { 52 | load_next(&rbuf, &row, 0); 53 | if (row.stop) { // ----------------------------------------------- reached the last chunk and possibly need to backup to the previous chunk to find a match 54 | if (done_skipping) { // -------------------------------------- already gone back to the previous chunk, time to stop 55 | break; 56 | } else { // -------------------------------------------------- go back and check the entire last chunk for a match 57 | read_goto_last_chunk(&rbuf, &rabuf, 0); 58 | done_skipping = true; 59 | } 60 | } else { // ------------------------------------------------------ reading data chunk by chunk, checking the first row and the proceeding to the next chunk 61 | ASSERT_SIZE(value_type, row.sizes[0]); 62 | if (matched) { // -------------------------------------------- once a match is found dump every row 63 | dump(&wbuf, &row, 0); 64 | } else { // -------------------------------------------------- check for a match 65 | cmp = compare(value_type, row.columns[0], val); 66 | if (done_skipping) { // ---------------------------------- since we are done skipping ahead by chunks, check every row for a match 67 | if (cmp >= 0) { 68 | dump(&wbuf, &row, 0); 69 | matched = true; 70 | } 71 | } else if (cmp < 0) { // --------------------------------- we aren't done skipping ahead, we want to keep skipping until we've gone too far 72 | read_goto_next_chunk(&rbuf, &rabuf, 0); 73 | } else { // ---------------------------------------------- we've gone too far, time to backup one chunk and start checking every row 74 | read_goto_last_chunk(&rbuf, &rabuf, 0); 75 | done_skipping = true; 76 | } 77 | } 78 | } 79 | } 80 | dump_flush(&wbuf, 0); 81 | } 82 | -------------------------------------------------------------------------------- 
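bdropuntil leans on the chunk structure: it skips forward chunk by chunk while even the first row of a chunk is still below VALUE, then backs up one chunk and scans row by row, which is why the input must be sorted. A sketch with typed values, assuming bschema's a:i64 and i64:a single-column conversions:

```bash
>> seq 1 1000000 | bsv | bschema a:i64 | bdropuntil 999998 i64 | bschema i64:a | csv
999998
999999
1000000
```
--------------------------------------------------------------------------------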
/src/bhead.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "load.h" 3 | #include "dump.h" 4 | 5 | #define DESCRIPTION "keep the first n rows\n\n" 6 | #define USAGE "... | bhead N\n\n" 7 | #define EXAMPLE ">> echo '\na\nb\nc\n' | bsv | bhead 2 | csv\na\nb\n" 8 | 9 | int main(int argc, char **argv) { 10 | 11 | // setup bsv 12 | SETUP(); 13 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 14 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 15 | 16 | // setup state 17 | ASSERT(argc == 2 && isdigits(argv[1]), "usage: %s", USAGE); 18 | row_t row; 19 | i64 max = atol(argv[1]); 20 | i64 count = 0; 21 | 22 | // process input row by row 23 | while (1) { 24 | load_next(&rbuf, &row, 0); 25 | if (row.stop || count++ >= max) 26 | break; 27 | dump(&wbuf, &row, 0); 28 | } 29 | dump_flush(&wbuf, 0); 30 | } 31 | -------------------------------------------------------------------------------- /src/blz4.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "lz4.h" 3 | 4 | #define DESCRIPTION "compress bsv data\n\n" 5 | #define USAGE "... | blz4 \n\n" 6 | #define EXAMPLE ">> echo a,b,c | bsv | blz4 | blz4d | csv\na,b,c\n" 7 | 8 | int main(int argc, char **argv) { 9 | 10 | // setup bsv 11 | SETUP(); 12 | 13 | // setup state 14 | i32 size; 15 | i32 lz4_size; 16 | u8 *buf; 17 | MALLOC(buf, BUFFER_SIZE); 18 | u8 *lz4_buf; 19 | MALLOC(lz4_buf, BUFFER_SIZE_LZ4); 20 | ASSERT(LZ4_compressBound(BUFFER_SIZE) <= BUFFER_SIZE_LZ4, "fatal: lz4 compress bound\n"); 21 | 22 | // process input row by row 23 | while (1) { 24 | if (0 == fread_unlocked(&size, 1, sizeof(i32), stdin)) 25 | break; 26 | FREAD(buf, size, stdin); 27 | lz4_size = LZ4_compress_fast(buf, lz4_buf, size, BUFFER_SIZE_LZ4, LZ4_ACCELERATION); 28 | FWRITE(&size, sizeof(i32), stdout); 29 | FWRITE(&lz4_size, sizeof(i32), stdout); 30 | FWRITE(lz4_buf, lz4_size, stdout); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/blz4d.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "lz4.h" 3 | 4 | #define DESCRIPTION "decompress bsv data\n\n" 5 | #define USAGE "... 
| blz4d \n\n" 6 | #define EXAMPLE ">> echo a,b,c | bsv | blz4 | blz4d | csv\na,b,c\n" 7 | 8 | int main(int argc, char **argv) { 9 | 10 | // setup bsv 11 | SETUP(); 12 | 13 | // setup state 14 | i32 size; 15 | i32 lz4_size; 16 | u8 *buf; 17 | MALLOC(buf, BUFFER_SIZE); 18 | u8 *lz4_buf; 19 | MALLOC(lz4_buf, BUFFER_SIZE_LZ4); 20 | ASSERT(LZ4_compressBound(BUFFER_SIZE) <= BUFFER_SIZE_LZ4, "fatal: lz4 compress bound\n"); 21 | 22 | // process input row by row 23 | while (1) { 24 | if (0 == fread_unlocked(&size, 1, sizeof(i32), stdin)) 25 | break; 26 | FREAD(&lz4_size, sizeof(i32), stdin); 27 | FREAD(lz4_buf, lz4_size, stdin); 28 | ASSERT(size == LZ4_decompress_safe(lz4_buf, buf, lz4_size, BUFFER_SIZE), "fatal: decompress size mismatch\n"); 29 | FWRITE(&size, sizeof(i32), stdout); 30 | FWRITE(buf, size, stdout); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/bpartition.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "argh.h" 3 | #include "load.h" 4 | #include "dump.h" 5 | #include "xxh3.h" 6 | #include 7 | #include 8 | #include 9 | 10 | #define SEED 0 11 | 12 | #define DESCRIPTION "split into multiple files by consistent hash of the first column value\n\n" 13 | #define USAGE "\n... | bpartition NUM_BUCKETS [PREFIX] [-l|--lz4]\n\n" 14 | #define EXAMPLE ">> echo '\na\nb\nc\n' | bsv | bpartition 10 prefix\nprefix03\nprefix06\n" 15 | 16 | int empty_file(char *path) { 17 | struct stat st; 18 | if (stat(path, &st) == 0) 19 | return st.st_size == 0; 20 | return -1; 21 | } 22 | 23 | int main(int argc, char **argv) { 24 | 25 | // setup bsv 26 | SETUP(); 27 | 28 | // setup input 29 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 30 | 31 | // setup state 32 | row_t row; 33 | u8 *prefix; 34 | u8 num_buckets_str[16]; 35 | u8 path[1024]; 36 | i32 empty; 37 | i32 num_buckets; 38 | u64 file_num; 39 | u64 hash; 40 | 41 | // parse args 42 | bool lz4 = false; 43 | ARGH_PARSE { 44 | ARGH_NEXT(); 45 | if ARGH_BOOL("-l", "--lz4") { lz4 = true; } 46 | } 47 | ASSERT(ARGH_ARGC >= 1, "usage: %s", USAGE); 48 | ASSERT(strlen(ARGH_ARGV[0]) <= 8, "NUM_BUCKETS must be less than 1e8, got: %s\n", argv[1]); 49 | num_buckets = atoi(ARGH_ARGV[0]); 50 | ASSERT(num_buckets > 0, "NUM_BUCKETS must be positive, got: %d\n", num_buckets); 51 | if (ARGH_ARGC == 2) { 52 | prefix = ARGH_ARGV[1]; 53 | } else { 54 | prefix = ""; 55 | } 56 | 57 | // open output files 58 | FILE *files[num_buckets]; 59 | SNPRINTF(num_buckets_str, sizeof(num_buckets_str), "%d", num_buckets); 60 | for (i32 i = 0; i < num_buckets; i++) { 61 | if (strlen(prefix) != 0) 62 | SNPRINTF(path, sizeof(path), "%s_%0*d", prefix, strlen(num_buckets_str), i); 63 | else 64 | SNPRINTF(path, sizeof(path), "%0*d", strlen(num_buckets_str), i); 65 | FOPEN(files[i], path, "ab"); 66 | } 67 | 68 | // setup output 69 | writebuf_t wbuf = wbuf_init(files, num_buckets, lz4); 70 | 71 | // for 1 bucket, pipe the data straight through 72 | if (num_buckets == 1) { 73 | i32 rbytes; 74 | i32 chunk_size; 75 | while (1) { 76 | rbytes = fread_unlocked(&chunk_size, 1, sizeof(i32), rbuf.files[0]); 77 | ASSERT(rbytes == 0 || rbytes == sizeof(i32), "fatal: bad bpartition chunk read %d\n", rbytes); 78 | if (rbytes != sizeof(i32)) 79 | break; 80 | FREAD(wbuf.buffer[0], chunk_size, rbuf.files[0]); 81 | wbuf.offset[0] = chunk_size; 82 | write_flush(&wbuf, 0); 83 | } 84 | 85 | // for more than 1 bucket, process input row by row 86 | } else { 87 | while (1) { 88 | 
load_next(&rbuf, &row, 0); 89 | if (row.stop) 90 | break; 91 | hash = XXH3_64bits(row.columns[0], row.sizes[0]); 92 | file_num = hash % num_buckets; 93 | dump(&wbuf, &row, file_num); 94 | } 95 | } 96 | 97 | // flush and close 98 | for (i32 i = 0; i < num_buckets; i++) { 99 | dump_flush(&wbuf, i); 100 | ASSERT(fclose(files[i]) != EOF, "fatal: failed to close files\n"); 101 | } 102 | 103 | // delete any empty output files 104 | for (i32 i = 0; i < num_buckets; i++) { 105 | if (strlen(prefix) != 0) 106 | SNPRINTF(path, sizeof(path), "%s_%0*d", prefix, strlen(num_buckets_str), i); 107 | else 108 | SNPRINTF(path, sizeof(path), "%0*d", strlen(num_buckets_str), i); 109 | empty = empty_file(path); 110 | if (empty == 1) { 111 | ASSERT(remove(path) == 0, "fatal: failed to delete file: %s\n", path); 112 | } else { 113 | ASSERT(empty != -1, "fatal: failed to stat file: %s\n", path); 114 | FPRINTF(stdout, "%s\n", path); 115 | } 116 | } 117 | 118 | } 119 | -------------------------------------------------------------------------------- /src/bquantile_merge.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "ddsketch.h" 3 | #include "load.h" 4 | #include "dump.h" 5 | #include "argh.h" 6 | 7 | #define DESCRIPTION "merge ddsketches and output quantile value pairs as f64\n\n" 8 | #define USAGE "... | bquantile-merge QUANTILES \n\n" 9 | #define EXAMPLE ">> seq 1 100 | bsv | bschema a:i64 | bquantile-sketch i64 | bquantile-merge .2,.5,.7 | bschema f64:a,f64:a | csv\n0.2,19.88667024086646\n0.5,49.90296094906742\n0.7,70.11183939140405\n" 10 | 11 | int main(int argc, char **argv) { 12 | 13 | // setup bsv 14 | SETUP(); 15 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 16 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 17 | 18 | // parse args 19 | ASSERT(argc == 2, "usage: %s", USAGE); 20 | i32 num_quantiles = 0; 21 | f64 quantiles[MAX_COLUMNS]; 22 | f64 quantile; 23 | char *f; 24 | char *fs = (char*)argv[1]; 25 | while ((f = strsep(&fs, ","))) { 26 | ASSERT(isdigits_ordot(f), "fatal: bad arg\n"); 27 | quantile = atof(f); 28 | ASSERT(quantile >= 0 && quantile <= 1, "fatal: bad arg\n"); 29 | quantiles[num_quantiles++] = quantile; 30 | } 31 | 32 | // setup state 33 | row_t row; 34 | sketch_t *s = NULL; 35 | sketch_t *o; 36 | 37 | // process input row by row 38 | while (1) { 39 | load_next(&rbuf, &row, 0); 40 | if (row.stop) 41 | break; 42 | o = sketch_from_row(&row); 43 | if (s) { 44 | ASSERT(s->config->max_num_bins == o->config->max_num_bins, "fatal: must merge sketches with same config settings\n"); 45 | ASSERT(s->config->gamma == o->config->gamma, "fatal: must merge sketches with same config settings\n"); 46 | ASSERT(s->config->min_value == o->config->min_value, "fatal: must merge sketches with same config settings\n"); 47 | sketch_merge(s, o); 48 | } else 49 | s = o; 50 | } 51 | 52 | // dump quantiles 53 | f64 val; 54 | for (i32 i = 0; i < num_quantiles; i++) { 55 | row.max = 1; 56 | row.columns[0] = &quantiles[i]; 57 | row.sizes[0] = sizeof(f64); 58 | val = sketch_quantile(s, quantiles[i]); 59 | row.columns[1] = &val; 60 | row.sizes[1] = sizeof(f64); 61 | dump(&wbuf, &row, 0); 62 | } 63 | dump_flush(&wbuf, 0); 64 | } 65 | -------------------------------------------------------------------------------- /src/bquantile_sketch.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "ddsketch.h" 3 | #include "load.h" 4 | #include "dump.h" 5 | #include "argh.h" 6 | 
7 | #define DESCRIPTION "collapse the first column into a single row ddsketch\n\n" 8 | #define USAGE "... | bquantile-sketch TYPE [-a|--alpha] [-b|--max-bins] [-m|--min-value] \n\n" 9 | #define EXAMPLE ">> seq 1 100 | bsv | bschema a:i64 | bquantile-sketch i64 | bquantile-merge .2,.5,.7 | bschema f64:a,f64:a | csv\n0.2,19.88667024086646\n0.5,49.90296094906742\n0.7,70.11183939140405\n" 10 | 11 | int main(int argc, char **argv) { 12 | 13 | // setup bsv 14 | SETUP(); 15 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 16 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 17 | 18 | // parse args 19 | f64 alpha = DEFAULT_ALPHA; 20 | i32 max_num_bins = DEFAULT_MAX_NUM_BINS; 21 | f64 min_value = DEFAULT_MIN_VALUE; 22 | ARGH_PARSE { 23 | ARGH_NEXT(); 24 | if ARGH_FLAG("-a", "--alpha") { alpha = atof(ARGH_VAL()); ASSERT(isdigits_ordot(ARGH_VAL()), "fatal: bad arg\n"); } 25 | else if ARGH_FLAG("-m", "--min-value") { min_value = atof(ARGH_VAL()); ASSERT(isdigits_ordot(ARGH_VAL()), "fatal: bad arg\n"); } 26 | else if ARGH_FLAG("-b", "--max-bins") { max_num_bins = atoi(ARGH_VAL()); ASSERT(isdigits(ARGH_VAL()), "fatal: bad arg\n"); } 27 | } 28 | i32 value_type; 29 | ASSERT(ARGH_ARGC == 1, "usage: %s", USAGE); 30 | if (strcmp(ARGH_ARGV[0], "i64") == 0) value_type = I64; 31 | else if (strcmp(ARGH_ARGV[0], "i32") == 0) value_type = I32; 32 | else if (strcmp(ARGH_ARGV[0], "i16") == 0) value_type = I16; 33 | else if (strcmp(ARGH_ARGV[0], "u64") == 0) value_type = U64; 34 | else if (strcmp(ARGH_ARGV[0], "u32") == 0) value_type = U32; 35 | else if (strcmp(ARGH_ARGV[0], "u16") == 0) value_type = U16; 36 | else if (strcmp(ARGH_ARGV[0], "f64") == 0) value_type = F64; 37 | else if (strcmp(ARGH_ARGV[0], "f32") == 0) value_type = F32; 38 | else ASSERT(0, "fatal: bad type %s\n", ARGH_ARGV[0]); 39 | 40 | // setup state 41 | row_t row; 42 | config_t *c = config_new(alpha, max_num_bins, min_value); 43 | sketch_t *s = sketch_new(c); 44 | 45 | // process input row by row 46 | while (1) { 47 | load_next(&rbuf, &row, 0); 48 | if (row.stop) 49 | break; 50 | ASSERT_SIZE(value_type, row.sizes[0]); 51 | switch (value_type) { 52 | case I64: sketch_add(s, (f64)*(i64*)(row.columns[0])); break; 53 | case I32: sketch_add(s, (f64)*(i32*)(row.columns[0])); break; 54 | case I16: sketch_add(s, (f64)*(i16*)(row.columns[0])); break; 55 | case U64: sketch_add(s, (f64)*(u64*)(row.columns[0])); break; 56 | case U32: sketch_add(s, (f64)*(u32*)(row.columns[0])); break; 57 | case U16: sketch_add(s, (f64)*(u16*)(row.columns[0])); break; 58 | case F64: sketch_add(s, (f64)*(f64*)(row.columns[0])); break; 59 | case F32: sketch_add(s, (f64)*(f32*)(row.columns[0])); break; 60 | } 61 | } 62 | 63 | // dump sketch 64 | sketch_to_row(&row, s); 65 | dump(&wbuf, &row, 0); 66 | dump_flush(&wbuf, 0); 67 | } 68 | -------------------------------------------------------------------------------- /src/bsort.c: -------------------------------------------------------------------------------- 1 | #define READ_GROWING 2 | #include "load.h" 3 | #include "dump.h" 4 | #include "array.h" 5 | #include "argh.h" 6 | 7 | #define DESCRIPTION "timsort rows by the first column\n\n" 8 | #define USAGE "... 
| bsort [-r|--reversed] [TYPE]\n\n" 9 | #define EXAMPLE ">> echo '\n3\n2\n1\n' | bsv | bschema a:i64 | bsort i64 | bschema i64:a | csv\n1\n2\n3\n\n" 10 | 11 | #define SORT_NAME row 12 | #define SORT_TYPE raw_row_t 13 | #define SORT_CMP(x, y) compare((x).meta, (x).buffer, (y).buffer) 14 | #include "sort.h" 15 | 16 | int main(int argc, char **argv) { 17 | 18 | // setup bsv 19 | SETUP(); 20 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 21 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 22 | 23 | // setup state 24 | row_t row; 25 | raw_row_t raw_row; 26 | ARRAY_INIT(array, raw_row_t); 27 | 28 | // parse args 29 | bool reversed = false; 30 | ARGH_PARSE { 31 | ARGH_NEXT(); 32 | if ARGH_BOOL("-r", "--reversed") { reversed = true; } 33 | } 34 | 35 | i32 value_type; 36 | if (!ARGH_ARGC) 37 | if (reversed) 38 | value_type = R_STR; 39 | else 40 | value_type = STR; 41 | else { 42 | ASSERT(ARGH_ARGC == 1, "usage: %s", USAGE); 43 | if (reversed) { 44 | if (strcmp(ARGH_ARGV[0], "i64") == 0) value_type = R_I64; 45 | else if (strcmp(ARGH_ARGV[0], "i32") == 0) value_type = R_I32; 46 | else if (strcmp(ARGH_ARGV[0], "i16") == 0) value_type = R_I16; 47 | else if (strcmp(ARGH_ARGV[0], "u64") == 0) value_type = R_U64; 48 | else if (strcmp(ARGH_ARGV[0], "u32") == 0) value_type = R_U32; 49 | else if (strcmp(ARGH_ARGV[0], "u16") == 0) value_type = R_U16; 50 | else if (strcmp(ARGH_ARGV[0], "f64") == 0) value_type = R_F64; 51 | else if (strcmp(ARGH_ARGV[0], "f32") == 0) value_type = R_F32; 52 | else ASSERT(0, "fatal: bad type %s\n", ARGH_ARGV[0]); 53 | } else { 54 | if (strcmp(ARGH_ARGV[0], "i64") == 0) value_type = I64; 55 | else if (strcmp(ARGH_ARGV[0], "i32") == 0) value_type = I32; 56 | else if (strcmp(ARGH_ARGV[0], "i16") == 0) value_type = I16; 57 | else if (strcmp(ARGH_ARGV[0], "u64") == 0) value_type = U64; 58 | else if (strcmp(ARGH_ARGV[0], "u32") == 0) value_type = U32; 59 | else if (strcmp(ARGH_ARGV[0], "u16") == 0) value_type = U16; 60 | else if (strcmp(ARGH_ARGV[0], "f64") == 0) value_type = F64; 61 | else if (strcmp(ARGH_ARGV[0], "f32") == 0) value_type = F32; 62 | else ASSERT(0, "fatal: bad type %s\n", ARGH_ARGV[0]); 63 | } 64 | } 65 | 66 | // read 67 | while (1) { 68 | load_next(&rbuf, &row, 0); 69 | if (row.stop) 70 | break; 71 | ASSERT_SIZE(value_type, row.sizes[0]); 72 | row_to_raw(&row, &raw_row); 73 | raw_row.meta = value_type; 74 | ARRAY_APPEND(array, raw_row, raw_row_t); 75 | } 76 | 77 | // sort 78 | row_tim_sort(array, array_size); 79 | 80 | // write 81 | for (i32 i = 0; i < array_size; i++) 82 | dump_raw(&wbuf, &array[i], 0); 83 | dump_flush(&wbuf, 0); 84 | } 85 | -------------------------------------------------------------------------------- /src/bsplit.c: -------------------------------------------------------------------------------- 1 | #include "read_ahead.h" 2 | #include "util.h" 3 | #include "load.h" 4 | #include "write.h" 5 | 6 | #define DESCRIPTION "split a stream into multiple files\n\n" 7 | #define USAGE "... 
| bsplit PREFIX [chunks_per_file=1] \n\n" 8 | #define EXAMPLE ">> echo -n a,b,c | bsv | bsplit prefix\nprefix_0000000000\n" 9 | 10 | int main(int argc, char **argv) { 11 | 12 | // setup bsv 13 | SETUP(); 14 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 15 | readaheadbuf_t rabuf = rabuf_init(1); 16 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 17 | 18 | // setup state 19 | i32 i = 0; 20 | i32 j = 0; 21 | ASSERT(argc >= 2, "usage: %s", USAGE); 22 | u8 *prefix = argv[1]; 23 | u8 filename[1024]; 24 | FILE *f = NULL; 25 | i32 chunks_per_file = 1; 26 | row_t row; 27 | 28 | // parse args 29 | if (argc == 3) 30 | chunks_per_file = atol(argv[2]); 31 | 32 | // process input row by row 33 | while (1) { 34 | 35 | load_next(&rbuf, &row, 0); 36 | if (row.stop) 37 | break; 38 | 39 | // open and print next file if needed 40 | if (f == NULL) { 41 | memset(filename, 0, sizeof(filename)); 42 | SNPRINTF(filename, sizeof(filename), "%s_%010d", prefix, i++); 43 | FOPEN(f, filename, "wb"); 44 | FPRINTF(stdout, "%s\n", filename); 45 | } 46 | 47 | // write chunk 48 | FWRITE(&rbuf.chunk_size[0], sizeof(i32), f); 49 | FWRITE(rbuf.buffers[0], rbuf.chunk_size[0], f); 50 | read_goto_next_chunk(&rbuf, &rabuf, 0); 51 | 52 | // close file if needed 53 | if (++j % chunks_per_file == 0) { 54 | ASSERT(fclose(f) != EOF, "fatal: failed to close files\n"); 55 | f = NULL; 56 | } 57 | 58 | } 59 | 60 | // close last file if needed 61 | if (f) 62 | ASSERT(fclose(f) != EOF, "fatal: failed to close files\n"); 63 | 64 | } 65 | -------------------------------------------------------------------------------- /src/bsum.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "load.h" 3 | #include "dump.h" 4 | 5 | #define DESCRIPTION "sum the first column\n\n" 6 | #define USAGE "... 
| bsum TYPE \n\n" 7 | #define EXAMPLE ">> echo '\n1\n2\n3\n4\n' | bsv | bschema a:i64 | bsum i64 | bschema i64:a | csv\n10\n" 8 | 9 | int main(int argc, char **argv) { 10 | 11 | // setup bsv 12 | SETUP(); 13 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 14 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 15 | 16 | // setup state 17 | i64 sum_i64 = 0; 18 | i32 sum_i32 = 0; 19 | i16 sum_i16 = 0; 20 | u64 sum_u64 = 0; 21 | u32 sum_u32 = 0; 22 | u16 sum_u16 = 0; 23 | f64 sum_f64 = 0; 24 | f32 sum_f32 = 0; 25 | i32 value_type; 26 | row_t row; 27 | 28 | // parse args 29 | ASSERT(argc == 2, "usage: %s", USAGE); 30 | if (strcmp(argv[1], "i64") == 0) value_type = I64; 31 | else if (strcmp(argv[1], "i32") == 0) value_type = I32; 32 | else if (strcmp(argv[1], "i16") == 0) value_type = I16; 33 | else if (strcmp(argv[1], "u64") == 0) value_type = U64; 34 | else if (strcmp(argv[1], "u32") == 0) value_type = U32; 35 | else if (strcmp(argv[1], "u16") == 0) value_type = U16; 36 | else if (strcmp(argv[1], "f64") == 0) value_type = F64; 37 | else if (strcmp(argv[1], "f32") == 0) value_type = F32; 38 | else ASSERT(0, "fatal: bad type %s\n", argv[1]); 39 | 40 | // process input row by row 41 | while (1) { 42 | load_next(&rbuf, &row, 0); 43 | if (row.stop) 44 | break; 45 | ASSERT_SIZE(value_type, row.sizes[0]); 46 | switch (value_type) { 47 | case I64: sum_i64 += *(i64*)(row.columns[0]); break; 48 | case I32: sum_i32 += *(i32*)(row.columns[0]); break; 49 | case I16: sum_i16 += *(i16*)(row.columns[0]); break; 50 | case U64: sum_u64 += *(u64*)(row.columns[0]); break; 51 | case U32: sum_u32 += *(u32*)(row.columns[0]); break; 52 | case U16: sum_u16 += *(u16*)(row.columns[0]); break; 53 | case F64: sum_f64 += *(f64*)(row.columns[0]); break; 54 | case F32: sum_f32 += *(f32*)(row.columns[0]); break; 55 | } 56 | } 57 | 58 | // output sum 59 | row.max = 0; 60 | switch (value_type) { 61 | case I64: row.columns[0] = &sum_i64; row.sizes[0] = sizeof(i64); break; 62 | case I32: row.columns[0] = &sum_i32; row.sizes[0] = sizeof(i32); break; 63 | case I16: row.columns[0] = &sum_i16; row.sizes[0] = sizeof(i16); break; 64 | case U64: row.columns[0] = &sum_u64; row.sizes[0] = sizeof(u64); break; 65 | case U32: row.columns[0] = &sum_u32; row.sizes[0] = sizeof(u32); break; 66 | case U16: row.columns[0] = &sum_u16; row.sizes[0] = sizeof(u16); break; 67 | case F64: row.columns[0] = &sum_f64; row.sizes[0] = sizeof(f64); break; 68 | case F32: row.columns[0] = &sum_f32; row.sizes[0] = sizeof(f32); break; 69 | } 70 | dump(&wbuf, &row, 0); 71 | dump_flush(&wbuf, 0); 72 | } 73 | -------------------------------------------------------------------------------- /src/bsv.c: -------------------------------------------------------------------------------- 1 | #include "csv.h" 2 | #include "dump.h" 3 | 4 | #define DESCRIPTION "convert csv to bsv\n\n" 5 | #define USAGE "... 
| bsv\n\n" 6 | #define EXAMPLE ">> echo a,b,c | bsv | bcut 3,2,1 | csv\nc,b,a\n" 7 | 8 | int main(int argc, char **argv) { 9 | 10 | // setup bsv 11 | SETUP(); 12 | 13 | // setup input 14 | CSV_INIT(); 15 | 16 | // setup output 17 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 18 | 19 | // setup state 20 | row_t row; 21 | 22 | // process input row by row 23 | while (1) { 24 | CSV_READ_LINE(stdin); 25 | if (csv_stop) 26 | break; 27 | if (csv_max > 0 || csv_sizes[0] > 0) { 28 | row.max = csv_max; 29 | for (i32 i = 0; i <= row.max; i++) { 30 | row.columns[i] = csv_columns[i]; 31 | row.sizes[i] = csv_sizes[i]; 32 | } 33 | dump(&wbuf, &row, 0); 34 | } 35 | } 36 | dump_flush(&wbuf, 0); 37 | } 38 | -------------------------------------------------------------------------------- /src/btake.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "load.h" 3 | #include "dump.h" 4 | 5 | #define DESCRIPTION "take while the first column is VALUE\n\n" 6 | #define USAGE "... | btake VALUE\n\n" 7 | #define EXAMPLE ">> echo '\na\nb\nc\nd\n' | bsv | bdropntil c | btake c | csv\nc\n\n" 8 | 9 | int main(int argc, char **argv) { 10 | 11 | // setup bsv 12 | SETUP(); 13 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 14 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 15 | 16 | // setup state 17 | row_t row; 18 | u8 *val = argv[1]; 19 | 20 | // process input row by row 21 | while (1) { 22 | load_next(&rbuf, &row, 0); 23 | if (row.stop) 24 | break; 25 | if (compare_str(row.columns[0], val) != 0) 26 | break; 27 | dump(&wbuf, &row, 0); 28 | } 29 | dump_flush(&wbuf, 0); 30 | } 31 | -------------------------------------------------------------------------------- /src/btakeuntil.c: -------------------------------------------------------------------------------- 1 | #include "read_ahead.h" 2 | #include "util.h" 3 | #include "load.h" 4 | #include "dump.h" 5 | 6 | #define DESCRIPTION "for sorted input, take until the first column is gte to VALUE\n\n" 7 | #define USAGE "... 
| btakeuntil VALUE [TYPE]\n\n" 8 | #define EXAMPLE ">> echo '\na\nb\nc\nd\n' | bsv | btakeuntil c | csv\na\nb\n\n" 9 | 10 | int main(int argc, char **argv) { 11 | 12 | // setup bsv 13 | SETUP(); 14 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 15 | readaheadbuf_t rabuf = rabuf_init(1); 16 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 17 | 18 | // setup state 19 | bool done_skipping = false; 20 | bool matched = false; 21 | i32 cmp; 22 | row_t row; 23 | ASSERT(argc >= 2, "usage: %s", USAGE); 24 | i64 val_i64; 25 | i32 val_i32; 26 | i16 val_i16; 27 | u64 val_u64; 28 | u32 val_u32; 29 | u16 val_u16; 30 | f64 val_f64; 31 | f32 val_f32; 32 | void *val; 33 | i32 value_type; 34 | if (argc == 2) { 35 | val = argv[1]; 36 | value_type = STR; 37 | } else { 38 | ASSERT(argc == 3, "usage: %s", USAGE); 39 | if (strcmp(argv[2], "i64") == 0) { value_type = I64; val_i64 = atol(argv[1]); val = &val_i64; } 40 | else if (strcmp(argv[2], "i32") == 0) { value_type = I32; val_i32 = atol(argv[1]); val = &val_i32; } 41 | else if (strcmp(argv[2], "i16") == 0) { value_type = I16; val_i16 = atol(argv[1]); val = &val_i16; } 42 | else if (strcmp(argv[2], "u64") == 0) { value_type = U64; val_u64 = atol(argv[1]); val = &val_u64; } 43 | else if (strcmp(argv[2], "u32") == 0) { value_type = U32; val_u32 = atol(argv[1]); val = &val_u32; } 44 | else if (strcmp(argv[2], "u16") == 0) { value_type = U16; val_u16 = atol(argv[1]); val = &val_u16; } 45 | else if (strcmp(argv[2], "f64") == 0) { value_type = F64; val_f64 = atof(argv[1]); val = &val_f64; } 46 | else if (strcmp(argv[2], "f32") == 0) { value_type = F32; val_f32 = atof(argv[1]); val = &val_f32; } 47 | else ASSERT(0, "fatal: bad type %s\n", argv[2]); 48 | } 49 | 50 | // process input row by row 51 | while (1) { 52 | load_next(&rbuf, &row, 0); 53 | if (row.stop) { // ----------------------------------------------- reached the last chunk and possibly need to back up to the previous chunk to find a match 54 | if (done_skipping) { // -------------------------------------- already gone back to the previous chunk, time to stop 55 | break; 56 | } else { // -------------------------------------------------- go back and check the entire last chunk for a match 57 | read_goto_last_chunk(&rbuf, &rabuf, 0); 58 | done_skipping = true; 59 | } 60 | } else { // ------------------------------------------------------ reading data chunk by chunk, checking the first row and then proceeding to the next chunk 61 | ASSERT_SIZE(value_type, row.sizes[0]); 62 | cmp = compare(value_type, row.columns[0], val); // ------------------- check for a match 63 | if (done_skipping) { // -------------------------------------- since we are done skipping ahead by chunks, check every row for a match 64 | if (cmp >= 0) // ----------------------------------------- found a match, time to stop 65 | break; 66 | dump(&wbuf, &row, 0); // --------------------------------- otherwise dump the row 67 | } else if (cmp < 0) { // ------------------------------------- we aren't done skipping ahead, we want to keep skipping until we've gone too far 68 | if (rabuf.has_nexted) { // ------------------------------- write the entire last chunk since we know all of its rows are not a match 69 | memcpy(wbuf.buffer[0], rabuf.last_buffers[0], rabuf.last_chunk_size[0]); 70 | wbuf.offset[0] = rabuf.last_chunk_size[0]; 71 | write_flush(&wbuf, 0); 72 | } 73 | read_goto_next_chunk(&rbuf, &rabuf, 0); 74 | } else { // -------------------------------------------------- we've gone too far, time to back up one chunk and 
start checking every row 75 | read_goto_last_chunk(&rbuf, &rabuf, 0); 76 | done_skipping = true; 77 | } 78 | } 79 | } 80 | dump_flush(&wbuf, 0); 81 | } 82 | -------------------------------------------------------------------------------- /src/btopn.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "argh.h" 3 | #include "array.h" 4 | #include "load.h" 5 | #include "dump.h" 6 | 7 | #define HEAP_COMPARE(meta, x, y) compare(meta, ((raw_row_t*)x)->buffer, ((raw_row_t*)y)->buffer) > 0 8 | #include "heap.h" 9 | 10 | #define DESCRIPTION "accumulate the top n rows in a heap by first column value\n\n" 11 | #define USAGE "... | btopn N [TYPE] [-r|--reversed]\n\n" 12 | #define EXAMPLE ">> echo '\n1\n3\n2\n' | bsv | bschema a:i64 | btopn 2 i64 | bschema i64:a | csv\n3\n2\n\n" 13 | 14 | int main(int argc, char **argv) { 15 | 16 | // setup bsv 17 | SETUP(); 18 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 19 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 20 | 21 | // parse args 22 | bool reversed = false; 23 | ARGH_PARSE { 24 | ARGH_NEXT(); 25 | if ARGH_BOOL("-r", "--reversed") { reversed = true; } 26 | } 27 | ASSERT(ARGH_ARGC >= 1, "usage: %s", USAGE); 28 | ASSERT(isdigits(ARGH_ARGV[0]), "usage: %s", USAGE); 29 | i32 top_n = atol(ARGH_ARGV[0]); 30 | i32 value_type; 31 | if (ARGH_ARGC == 1) 32 | if (reversed) 33 | value_type = R_STR; 34 | else 35 | value_type = STR; 36 | else { 37 | ASSERT(ARGH_ARGC == 2, "usage: %s", USAGE); 38 | if (reversed) { 39 | if (strcmp(ARGH_ARGV[1], "i64") == 0) value_type = R_I64; 40 | else if (strcmp(ARGH_ARGV[1], "i32") == 0) value_type = R_I32; 41 | else if (strcmp(ARGH_ARGV[1], "i16") == 0) value_type = R_I16; 42 | else if (strcmp(ARGH_ARGV[1], "u64") == 0) value_type = R_U64; 43 | else if (strcmp(ARGH_ARGV[1], "u32") == 0) value_type = R_U32; 44 | else if (strcmp(ARGH_ARGV[1], "u16") == 0) value_type = R_U16; 45 | else if (strcmp(ARGH_ARGV[1], "f64") == 0) value_type = R_F64; 46 | else if (strcmp(ARGH_ARGV[1], "f32") == 0) value_type = R_F32; 47 | else ASSERT(0, "fatal: bad type %s\n", ARGH_ARGV[1]); 48 | } else { 49 | if (strcmp(ARGH_ARGV[1], "i64") == 0) value_type = I64; 50 | else if (strcmp(ARGH_ARGV[1], "i32") == 0) value_type = I32; 51 | else if (strcmp(ARGH_ARGV[1], "i16") == 0) value_type = I16; 52 | else if (strcmp(ARGH_ARGV[1], "u64") == 0) value_type = U64; 53 | else if (strcmp(ARGH_ARGV[1], "u32") == 0) value_type = U32; 54 | else if (strcmp(ARGH_ARGV[1], "u16") == 0) value_type = U16; 55 | else if (strcmp(ARGH_ARGV[1], "f64") == 0) value_type = F64; 56 | else if (strcmp(ARGH_ARGV[1], "f32") == 0) value_type = F32; 57 | else ASSERT(0, "fatal: bad type %s\n", ARGH_ARGV[1]); 58 | } 59 | } 60 | 61 | // setup state 62 | row_t row; 63 | raw_row_t *raw_row; 64 | heap_t h = {0}; 65 | h.meta = value_type; 66 | 67 | // process input row by row 68 | while (1) { 69 | load_next(&rbuf, &row, 0); 70 | if (row.stop) 71 | break; 72 | ASSERT_SIZE(value_type, row.sizes[0]); 73 | MALLOC(raw_row, sizeof(raw_row_t)); 74 | row_to_raw_malloc(&row, raw_row); 75 | heap_insert(&h, raw_row); 76 | if (h.size > top_n * 128) // amortize truncation cost, 128 is arbitrary 77 | heap_truncate(&h, top_n); 78 | } 79 | 80 | // dump output 81 | i32 i = top_n; 82 | while (i--) { 83 | if (!h.size) 84 | break; 85 | raw_row = (raw_row_t*)h.nodes[0]; 86 | dump_raw(&wbuf, raw_row, 0); 87 | heap_delete(&h); 88 | } 89 | dump_flush(&wbuf, 0); 90 | 91 | } 92 | 
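a reversed usage sketch for btopn (keeps the n smallest; output order assumed ascending, mirroring the forward example above):
>> echo '\n1\n3\n2\n' | bsv | bschema a:i64 | btopn 2 i64 -r | bschema i64:a | csv
1
2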
-------------------------------------------------------------------------------- /src/bunzip.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "argh.h" 3 | #include "load.h" 4 | #include "dump.h" 5 | 6 | #define DESCRIPTION "split a multi column input into single column outputs\n\n" 7 | #define USAGE "... | bunzip PREFIX [-l|--lz4]\n\n" 8 | #define EXAMPLE ">> echo '\na,b,c\n1,2,3\n' | bsv | bunzip col && echo col_1 col_3 | bzip | csv\na,c\n1,3\n" 9 | 10 | int main(int argc, char **argv) { 11 | 12 | // setup bsv 13 | SETUP(); 14 | 15 | // setup input 16 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 17 | 18 | // setup state 19 | u8 num_columns_str[16]; 20 | u8 path[1024]; 21 | u8 *prefix; 22 | row_t row; 23 | row_t new; 24 | new.max = 0; 25 | 26 | // parse args 27 | bool lz4 = false; 28 | ARGH_PARSE { 29 | ARGH_NEXT(); 30 | if ARGH_BOOL("-l", "--lz4") { lz4 = true; } 31 | } 32 | ASSERT(ARGH_ARGC == 1, "usage: %s", USAGE); 33 | prefix = ARGH_ARGV[0]; 34 | 35 | // read first row to find the number of columns 36 | load_next(&rbuf, &row, 0); 37 | if (row.stop) 38 | exit(0); 39 | i32 unzip_max = row.max; 40 | 41 | // open output files 42 | FILE *files[unzip_max + 1]; 43 | SNPRINTF(num_columns_str, sizeof(num_columns_str), "%d", unzip_max + 1); 44 | for (i32 i = 0; i <= unzip_max; i++) { 45 | SNPRINTF(path, sizeof(path), "%s_%0*d", prefix, strlen(num_columns_str), i + 1); 46 | FOPEN(files[i], path, "wb"); 47 | } 48 | 49 | // setup output 50 | writebuf_t wbuf = wbuf_init(files, unzip_max + 1, lz4); 51 | 52 | // output first row 53 | for (i32 i = 0; i <= unzip_max; i++) { 54 | new.sizes[0] = row.sizes[i]; 55 | new.columns[0] = row.columns[i]; 56 | dump(&wbuf, &new, i); 57 | } 58 | 59 | // load the next row in case we need to stop 60 | load_next(&rbuf, &row, 0); 61 | 62 | // process the rest of input row by row 63 | while (!row.stop) { 64 | ASSERT(row.max == unzip_max, "fatal: unzip found a bad row, needed max %d, got: %d\n", unzip_max, row.max); 65 | for (i32 i = 0; i <= unzip_max; i++) { 66 | new.sizes[0] = row.sizes[i]; 67 | new.columns[0] = row.columns[i]; 68 | dump(&wbuf, &new, i); 69 | } 70 | load_next(&rbuf, &row, 0); 71 | } 72 | 73 | // flush and close 74 | for (i32 i = 0; i <= unzip_max; i++) { 75 | dump_flush(&wbuf, i); 76 | ASSERT(fclose(files[i]) != EOF, "fatal: failed to close files\n"); 77 | SNPRINTF(path, sizeof(path), "%s_%0*d", prefix, strlen(num_columns_str), i + 1); 78 | FPRINTF(stdout, "%s\n", path); 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /src/bzip.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "argh.h" 3 | #include "load.h" 4 | #include "array.h" 5 | #include "dump.h" 6 | 7 | #define DESCRIPTION "combine single column inputs into a multi column output\n\n" 8 | #define USAGE "ls column_* | bzip [COL1,...COLN] [-l|--lz4]\n\n" 9 | #define EXAMPLE ">> echo '\na,b,c\n1,2,3\n' | bsv | bunzip column && ls column_* | bzip 1,3 | csv\na,c\n1,3\n" 10 | 11 | int main(int argc, char **argv) { 12 | 13 | // setup bsv 14 | SETUP(); 15 | 16 | // parse args 17 | bool lz4 = false; 18 | ARGH_PARSE { 19 | ARGH_NEXT(); 20 | if ARGH_BOOL("-l", "--lz4") { lz4 = true; } 21 | } 22 | 23 | // setup input, filenames come in on stdin 24 | ARRAY_INIT(files, FILE*); 25 | ARRAY_INIT(filename, u8); 26 | u8 tmp; 27 | FILE* file; 28 | i32 size; 29 | while (1) { 30 | size = fread_unlocked(&tmp, 1, 1, 
stdin); 31 | if (size != 1) 32 | break; 33 | if (tmp == '\n' || tmp == ' ') { 34 | if (ARRAY_SIZE(filename) > 0) { 35 | ARRAY_APPEND(filename, '\0', u8); 36 | FOPEN(file, filename, "rb"); 37 | ARRAY_APPEND(files, file, FILE*); 38 | ARRAY_RESET(filename); 39 | } 40 | } else { 41 | ARRAY_APPEND(filename, tmp, u8); 42 | } 43 | } 44 | readbuf_t rbuf = rbuf_init(files, ARRAY_SIZE(files), lz4); 45 | 46 | // parse selection 47 | ARRAY_INIT(selected, i32); 48 | u8 *f; 49 | i32 column; 50 | switch (ARGH_ARGC) { 51 | // default is all columns 52 | case 0: 53 | for (i32 i = 0; i < ARRAY_SIZE(files); i++) 54 | ARRAY_APPEND(selected, i, i32); 55 | break; 56 | // otherwise choose columns 57 | case 1: 58 | while ((f = strsep(&ARGH_ARGV[0], ","))) { 59 | column = atoi(f); 60 | ASSERT(column > 0, "fatal: bad column selection, should be like: '1,2,3' and cannot select below column 1.\n"); 61 | ASSERT(column <= ARRAY_SIZE(files), "fatal: bad column selection, should be like: '1,2,3' and cannot select above column %d.\n", ARRAY_SIZE(files)); 62 | ARRAY_APPEND(selected, column - 1, i32); 63 | } 64 | i32 used[ARRAY_SIZE(files)]; 65 | for (i32 i = 0; i < ARRAY_SIZE(files); i++) 66 | used[i] = -1; 67 | for (i32 i = 0; i < ARRAY_SIZE(selected); i++) { 68 | ASSERT(used[selected[i]] == -1, "fatal: can only select columns once, got dupe for column: %d\n", selected[i] + 1); 69 | used[selected[i]] = 1; 70 | } 71 | break; 72 | } 73 | 74 | // setup state 75 | row_t row; 76 | row_t new; 77 | new.max = ARRAY_SIZE(selected) - 1; 78 | i32 stops[ARRAY_SIZE(selected)]; 79 | i32 do_stop[ARRAY_SIZE(selected)]; 80 | i32 dont_stop[ARRAY_SIZE(selected)]; 81 | for (i32 i = 0; i < ARRAY_SIZE(selected); i++) { 82 | do_stop[i] = 1; 83 | dont_stop[i] = 0; 84 | } 85 | 86 | // setup output 87 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 88 | 89 | // process input row by row 90 | while (1) { 91 | for (i32 i = 0; i < ARRAY_SIZE(selected); i++) { 92 | load_next(&rbuf, &row, selected[i]); 93 | ASSERT(row.max == 0, "fatal: tried to zip a row with more than 1 column\n"); 94 | new.sizes[i] = row.sizes[0]; 95 | new.columns[i] = row.columns[0]; 96 | stops[i] = row.stop; 97 | } 98 | if (memcmp(stops, dont_stop, ARRAY_SIZE(selected) * sizeof(i32)) != 0) { 99 | ASSERT(memcmp(stops, do_stop, ARRAY_SIZE(selected) * sizeof(i32)) == 0, "fatal: all columns didn't end at the same length\n"); 100 | break; 101 | } 102 | dump(&wbuf, &new, 0); 103 | } 104 | dump_flush(&wbuf, 0); 105 | 106 | } 107 | -------------------------------------------------------------------------------- /src/csv.c: -------------------------------------------------------------------------------- 1 | #include "load.h" 2 | #include "write_simple.h" 3 | 4 | #define DESCRIPTION "convert bsv to csv\n\n" 5 | #define USAGE "... 
| csv\n\n" 6 | #define EXAMPLE ">> echo a,b,c | bsv | csv\na,b,c\n" 7 | 8 | int main(int argc, char **argv) { 9 | 10 | // setup bsv 11 | SETUP(); 12 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 13 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1); 14 | 15 | // setup state 16 | row_t row; 17 | i32 ran = 0; 18 | 19 | // process input row by row 20 | while (1) { 21 | load_next(&rbuf, &row, 0); 22 | if (row.stop) 23 | break; 24 | for (i32 i = 0; i <= row.max; i++) { 25 | write_bytes(&wbuf, row.columns[i], row.sizes[i], 0); 26 | if (i != row.max) 27 | write_bytes(&wbuf, ",", 1, 0); 28 | } 29 | write_bytes(&wbuf, "\n", 1, 0); 30 | ran = 1; 31 | } 32 | if (ran == 0) 33 | write_bytes(&wbuf, "\n", 1, 0); 34 | write_flush(&wbuf, 0); 35 | } 36 | -------------------------------------------------------------------------------- /src/xxh3.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "read_simple.h" 3 | #include "write_simple.h" 4 | #include "argh.h" 5 | #include "xxh3.h" 6 | 7 | #define DESCRIPTION "xxh3_64 hash stdin\n\n" 8 | #define USAGE "... | xxh3 [--stream|--int]\n\n" 9 | #define EXAMPLE \ 10 | " --stream pass stdin through to stdout with hash on stderr\n\n" \ 11 | " --int output hash as int not hash\n\n" \ 12 | ">> echo abc | xxh3\n079364cbfdf9f4cb\n" 13 | 14 | int main(int argc, char **argv) { 15 | 16 | // setup bsv 17 | SETUP(); 18 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1); 19 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1); 20 | 21 | // parse args 22 | bool int_out = false; 23 | bool stream = false; 24 | ARGH_PARSE { 25 | ARGH_NEXT(); 26 | if ARGH_BOOL("-i", "--int") { int_out = true; } 27 | else if ARGH_BOOL("-s", "--stream") { stream = true; } 28 | } 29 | 30 | // setup state 31 | XXH3_state_t state; 32 | ASSERT(XXH3_64bits_reset(&state) != XXH_ERROR, "xxh3 reset failed\n"); 33 | 34 | // process input row by row 35 | while (1) { 36 | read_bytes(&rbuf, BUFFER_SIZE, 0); 37 | ASSERT(XXH3_64bits_update(&state, rbuf.buffer, rbuf.bytes) != XXH_ERROR, "xxh3 update failed\n"); 38 | if (stream) 39 | write_bytes(&wbuf, rbuf.buffer, rbuf.bytes, 0); 40 | if (BUFFER_SIZE != rbuf.bytes) 41 | break; 42 | } 43 | 44 | // 45 | u64 hash = XXH3_64bits_digest(&state); 46 | FILE *out = (stream) ? 
stderr : stdout; 47 | if (int_out) 48 | FPRINTF(out, "%lu\n", hash); 49 | else 50 | FPRINTF(out, "%08x%08x\n", (i32)(hash>>32), (i32)hash); 51 | if (stream) 52 | write_flush(&wbuf, 0); 53 | } 54 | -------------------------------------------------------------------------------- /test/_queue_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shell 3 | import queue 4 | from hypothesis import given, settings 5 | from hypothesis.strategies import composite, integers, sampled_from, randoms 6 | from test_util import run, clone_source 7 | 8 | def setup_module(m): 9 | m.tempdir = clone_source() 10 | m.orig = os.getcwd() 11 | m.path = os.environ['PATH'] 12 | os.chdir(m.tempdir) 13 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 14 | shell.run('make clean', stream=True) 15 | shell.run('make _queue') 16 | 17 | def teardown_module(): 18 | with shell.climb_git_root(): 19 | shell.run('make clean', stream=True) 20 | 21 | @composite 22 | def inputs(draw): 23 | capacity = draw(integers(min_value=1, max_value=16)) 24 | num_actions = draw(integers(min_value=0, max_value=256)) 25 | rand = draw(randoms()) 26 | actions = [] 27 | for _ in range(num_actions): 28 | possible_actions = [ 29 | f'put {rand.randint(0, 999)}', 30 | 'get', 31 | ] 32 | actions.append(draw(sampled_from(possible_actions))) 33 | return capacity, actions 34 | 35 | def expected(arg): 36 | capacity, actions = arg 37 | res = [] 38 | q = queue.Queue(capacity) 39 | for action in actions: 40 | if action == 'get': 41 | try: 42 | res.append(q.get_nowait()) 43 | except queue.Empty: 44 | res.append('empty') 45 | elif action.split()[0] == 'put': 46 | try: 47 | q.put_nowait(action.split()[1]) 48 | except queue.Full: 49 | res.append('full') 50 | else: 51 | assert False, action 52 | return '\n'.join(map(str, res)) 53 | 54 | @given(inputs()) 55 | @settings(max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) 56 | def test_props(arg): 57 | capacity, actions = arg 58 | result = expected(arg) 59 | assert result == run('\n'.join(actions) + '\n', '_queue', capacity).strip() 60 | -------------------------------------------------------------------------------- /test/bcat_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shell 3 | from test_util import unindent, rm_whitespace, clone_source 4 | 5 | def setup_module(m): 6 | m.tempdir = clone_source() 7 | m.orig = os.getcwd() 8 | m.path = os.environ['PATH'] 9 | os.chdir(m.tempdir) 10 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 11 | shell.run('make clean && make bsv csv bcat', stream=True) 12 | 13 | def teardown_module(m): 14 | os.chdir(m.orig) 15 | os.environ['PATH'] = m.path 16 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 17 | shell.run('rm -rf', m.tempdir) 18 | 19 | def test_basic(): 20 | with shell.tempdir(): 21 | shell.run('for char in a a b b c c; do echo $char | bsv >> $char; done') 22 | stdout = """ 23 | a:a 24 | b:b 25 | c:c 26 | """ 27 | assert rm_whitespace(unindent(stdout)) == shell.run('bcat --prefix --head 1 a b c') 28 | stdout = """ 29 | a:a 30 | a:a 31 | b:b 32 | b:b 33 | c:c 34 | c:c 35 | """ 36 | assert rm_whitespace(unindent(stdout)) == shell.run('bcat --prefix --head 2 a b c') 37 | assert rm_whitespace(unindent(stdout)) == shell.run('bcat --head 2 --prefix a b c') 38 | assert 
rm_whitespace(unindent(stdout)) == shell.run('bcat --prefix a b c') 39 | stdout = """ 40 | a 41 | b 42 | c 43 | """ 44 | assert rm_whitespace(unindent(stdout)) == shell.run('bcat --head 1 a b c') 45 | stdout = """ 46 | a 47 | a 48 | b 49 | b 50 | c 51 | c 52 | """ 53 | assert rm_whitespace(unindent(stdout)) == shell.run('bcat a b c') 54 | -------------------------------------------------------------------------------- /test/bcombine_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import shell 4 | from test_util import run, rm_whitespace, clone_source 5 | 6 | def setup_module(m): 7 | m.tempdir = clone_source() 8 | m.orig = os.getcwd() 9 | m.path = os.environ['PATH'] 10 | os.chdir(m.tempdir) 11 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 12 | shell.run('make clean && make bsv csv bcombine', stream=True) 13 | 14 | def teardown_module(m): 15 | os.chdir(m.orig) 16 | os.environ['PATH'] = m.path 17 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 18 | shell.run('rm -rf', m.tempdir) 19 | 20 | def test_basic1(): 21 | stdin = """ 22 | a,b,c,d 23 | 1,2,3 24 | x,y 25 | """ 26 | stdout = """ 27 | a:b,a,b,c,d 28 | 1:2,1,2,3 29 | x:y,x,y 30 | """ 31 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bcombine 1,2 | csv') 32 | 33 | def test_basic2(): 34 | stdin = """ 35 | a,b,c,d 36 | 1,2,3 37 | x,y 38 | """ 39 | stdout = """ 40 | b:a,a,b,c,d 41 | 2:1,1,2,3 42 | y:x,x,y 43 | """ 44 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bcombine 2,1 | csv') 45 | 46 | def test_basic3(): 47 | stdin = """ 48 | a,b,c,d 49 | 1,2,3 50 | x 51 | """ 52 | with pytest.raises(Exception): 53 | run(rm_whitespace(stdin), 'bsv | bcombine 2,1 | csv') 54 | -------------------------------------------------------------------------------- /test/bcounteach_hash_test.py: -------------------------------------------------------------------------------- 1 | import random 2 | import os 3 | import collections 4 | import string 5 | import shell 6 | from hypothesis.database import ExampleDatabase 7 | from hypothesis import given, settings, HealthCheck 8 | from hypothesis.strategies import text, lists, composite, integers, randoms, sampled_from 9 | from test_util import run, rm_whitespace, clone_source, compile_buffer_sizes 10 | 11 | if os.environ.get('TEST_FACTOR'): 12 | buffers = list(sorted(set([128, 256, 1024, 1024 * 1024 * 5] + [random.randint(128, 1024) for _ in range(10)]))) 13 | else: 14 | buffers = [128] 15 | 16 | def setup_module(m): 17 | m.tempdir = clone_source() 18 | m.orig = os.getcwd() 19 | m.path = os.environ['PATH'] 20 | os.chdir(m.tempdir) 21 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 22 | shell.run('make clean', stream=True) 23 | compile_buffer_sizes('csv', buffers) 24 | compile_buffer_sizes('bsv', buffers) 25 | compile_buffer_sizes('bsort', buffers) 26 | compile_buffer_sizes('bschema', buffers) 27 | compile_buffer_sizes('bcounteach-hash', buffers) 28 | shell.run('make bsv csv bsort bschema bcounteach-hash', stream=True) 29 | 30 | def teardown_module(m): 31 | os.chdir(m.orig) 32 | os.environ['PATH'] = m.path 33 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 34 | shell.run('rm -rf', m.tempdir) 35 | 36 | @composite 37 | def inputs(draw): 38 | buffer = draw(sampled_from(buffers)) 39 | random = 
draw(randoms()) 40 | num_columns = draw(integers(min_value=1, max_value=4)) 41 | max_repeats = draw(integers(min_value=1, max_value=3)) 42 | column = text(string.ascii_lowercase, min_size=1, max_size=20) 43 | line = lists(column, min_size=num_columns, max_size=num_columns) 44 | lines = draw(lists(line)) 45 | lines = [','.join(x) for x in lines] 46 | lines = [l 47 | for line in lines 48 | for l in [line] * (1 if random.random() > .5 else random.randint(1, max_repeats))] 49 | return buffer, '\n'.join(lines) + '\n' 50 | 51 | def expected(csv): 52 | lines = csv.splitlines() 53 | lines = [x.split(',')[0] for x in lines] 54 | counts = collections.Counter(lines) 55 | return '\n'.join(f'{k},{counts[k]}' for k in sorted(counts) if k) + '\n' 56 | 57 | @given(inputs()) 58 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), suppress_health_check=HealthCheck.all()) # type: ignore 59 | def test_props(args): 60 | buffer, csv = args 61 | result = expected(csv) 62 | assert result == run(csv, f'bsv.{buffer} | bcounteach-hash.{buffer} | bschema.{buffer} *,i64:a | bsort.{buffer} | csv.{buffer}') 63 | 64 | def test_basic(): 65 | stdin = """ 66 | a 67 | a 68 | a 69 | b 70 | b 71 | a 72 | """ 73 | stdout = """ 74 | a,4 75 | b,2 76 | """ 77 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bcounteach-hash | bschema *,i64:a | bsort | csv') 78 | -------------------------------------------------------------------------------- /test/bcounteach_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import shell 4 | from hypothesis.database import ExampleDatabase 5 | from hypothesis import given, settings, HealthCheck 6 | from hypothesis.strategies import text, lists, composite, integers, randoms 7 | from test_util import run, rm_whitespace, clone_source 8 | 9 | def setup_module(m): 10 | m.tempdir = clone_source() 11 | m.orig = os.getcwd() 12 | m.path = os.environ['PATH'] 13 | os.chdir(m.tempdir) 14 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 15 | shell.run('make clean && make bsv csv bschema bcounteach', stream=True) 16 | 17 | def teardown_module(m): 18 | os.chdir(m.orig) 19 | os.environ['PATH'] = m.path 20 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 21 | shell.run('rm -rf', m.tempdir) 22 | 23 | @composite 24 | def inputs(draw): 25 | random = draw(randoms()) 26 | num_columns = draw(integers(min_value=1, max_value=4)) 27 | max_repeats = draw(integers(min_value=1, max_value=3)) 28 | column = text(string.ascii_lowercase, min_size=1, max_size=64) 29 | line = lists(column, min_size=num_columns, max_size=num_columns) 30 | lines = draw(lists(line)) 31 | lines = [','.join(x) for x in lines] 32 | lines = [l 33 | for line in lines 34 | for l in [line] * (1 if random.random() > .5 else random.randint(1, max_repeats))] 35 | return '\n'.join(lines) + '\n' 36 | 37 | def expected(csv): 38 | lines = csv.splitlines() 39 | lines = [x.split(',')[0] for x in lines] 40 | result = [] 41 | count = 0 42 | last = None 43 | for line in lines: 44 | if line: 45 | if last is not None and last != line: 46 | result.append(f'{last},{count}') 47 | count = 0 48 | last = line 49 | count += 1 50 | if last: 51 | result.append(f'{last},{count}') 52 | return '\n'.join(result) + '\n' 53 | 54 | @given(inputs()) 55 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * 
int(os.environ.get('TEST_FACTOR', 1)), suppress_health_check=HealthCheck.all()) # type: ignore 56 | def test_props(args): 57 | csv = args 58 | result = expected(csv) 59 | assert result == run(csv, 'bsv | bcounteach | bschema *,i64:a | csv') 60 | 61 | def test_basic(): 62 | stdin = """ 63 | a 64 | a 65 | a 66 | b 67 | b 68 | a 69 | """ 70 | stdout = """ 71 | a,3 72 | b,2 73 | a,1 74 | """ 75 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bcounteach | bschema *,i64:a | csv') 76 | -------------------------------------------------------------------------------- /test/bcountrows_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shell 3 | from test_util import run, rm_whitespace, clone_source 4 | 5 | def setup_module(m): 6 | m.tempdir = clone_source() 7 | m.orig = os.getcwd() 8 | m.path = os.environ['PATH'] 9 | os.chdir(m.tempdir) 10 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 11 | shell.run('make clean && make bsv csv bschema bcountrows', stream=True) 12 | 13 | def teardown_module(m): 14 | os.chdir(m.orig) 15 | os.environ['PATH'] = m.path 16 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 17 | shell.run('rm -rf', m.tempdir) 18 | 19 | def test_basic(): 20 | stdin = """ 21 | a 22 | a 23 | a 24 | b 25 | b 26 | a 27 | """ 28 | stdout = """ 29 | 6 30 | """ 31 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bcountrows | bschema i64:a | csv') 32 | -------------------------------------------------------------------------------- /test/bdedupe_hash_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import shell 4 | from hypothesis.database import ExampleDatabase 5 | from hypothesis import given, settings 6 | from hypothesis.strategies import text, lists, composite, integers, randoms 7 | from test_util import run, rm_whitespace, clone_source 8 | 9 | def setup_module(m): 10 | m.tempdir = clone_source() 11 | m.orig = os.getcwd() 12 | m.path = os.environ['PATH'] 13 | os.chdir(m.tempdir) 14 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 15 | shell.run('make clean && make bsv csv bdedupe-hash', stream=True) 16 | 17 | def teardown_module(m): 18 | os.chdir(m.orig) 19 | os.environ['PATH'] = m.path 20 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 21 | shell.run('rm -rf', m.tempdir) 22 | 23 | @composite 24 | def inputs(draw): 25 | random = draw(randoms()) 26 | num_columns = draw(integers(min_value=1, max_value=8)) 27 | max_repeats = draw(integers(min_value=1, max_value=3)) 28 | column = text(string.ascii_lowercase, min_size=1, max_size=64) 29 | line = lists(column, min_size=num_columns, max_size=num_columns) 30 | lines = draw(lists(line)) 31 | lines = [','.join(x) for x in lines] 32 | lines = [l 33 | for line in lines 34 | for l in [line] * (1 if random.random() > .5 else random.randint(1, max_repeats))] 35 | return '\n'.join(lines) + '\n' 36 | 37 | def expected(csv): 38 | lines = csv.splitlines() 39 | seen = set() 40 | result = [] 41 | for line in lines: 42 | key = line.split(',')[0] 43 | if key not in seen: 44 | seen.add(key) 45 | result.append(line) 46 | return '\n'.join(result) + '\n' 47 | 48 | @given(inputs()) 49 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), 
deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 50 | def test_props(args): 51 | csv = args 52 | result = expected(csv) 53 | assert result == run(csv, 'bsv | bdedupe-hash | csv') 54 | 55 | def test_basic(): 56 | stdin = """ 57 | a,1 58 | a,2 59 | a,3 60 | b,4 61 | b,5 62 | a,6 63 | a,7 64 | """ 65 | stdout = """ 66 | a,1 67 | b,4 68 | """ 69 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bdedupe-hash | csv') 70 | -------------------------------------------------------------------------------- /test/bdedupe_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import shell 4 | from hypothesis.database import ExampleDatabase 5 | from hypothesis import given, settings 6 | from hypothesis.strategies import text, lists, composite, integers, randoms 7 | from test_util import run, rm_whitespace, clone_source 8 | 9 | def setup_module(m): 10 | m.tempdir = clone_source() 11 | m.orig = os.getcwd() 12 | m.path = os.environ['PATH'] 13 | os.chdir(m.tempdir) 14 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 15 | shell.run('make clean && make bsv csv bdedupe', stream=True) 16 | 17 | def teardown_module(m): 18 | os.chdir(m.orig) 19 | os.environ['PATH'] = m.path 20 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 21 | shell.run('rm -rf', m.tempdir) 22 | 23 | @composite 24 | def inputs(draw): 25 | random = draw(randoms()) 26 | num_columns = draw(integers(min_value=1, max_value=8)) 27 | max_repeats = draw(integers(min_value=1, max_value=3)) 28 | column = text(string.ascii_lowercase, min_size=1, max_size=64) 29 | line = lists(column, min_size=num_columns, max_size=num_columns) 30 | lines = draw(lists(line)) 31 | lines = [','.join(x) for x in lines] 32 | lines = [l 33 | for line in lines 34 | for l in [line] * (1 if random.random() > .5 else random.randint(1, max_repeats))] 35 | return '\n'.join(lines) + '\n' 36 | 37 | def expected(csv): 38 | lines = csv.splitlines() 39 | result = [] 40 | for line in lines: 41 | if not result or result[-1].split(',')[0] != line.split(',')[0]: 42 | result.append(line) 43 | return '\n'.join(result) + '\n' 44 | 45 | @given(inputs()) 46 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 47 | def test_props(args): 48 | csv = args 49 | result = expected(csv) 50 | assert result == run(csv, 'bsv | bdedupe | csv') 51 | 52 | def test_basic(): 53 | stdin = """ 54 | a,1 55 | a,2 56 | a,3 57 | b,4 58 | b,5 59 | a,6 60 | a,7 61 | """ 62 | stdout = """ 63 | a,1 64 | b,4 65 | a,6 66 | """ 67 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bdedupe | csv') 68 | -------------------------------------------------------------------------------- /test/bdropuntil_i64_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shell 3 | import string 4 | from hypothesis.database import ExampleDatabase 5 | from hypothesis import given, settings 6 | from hypothesis.strategies import lists, composite, integers, randoms, floats, text 7 | from test_util import run, clone_source 8 | 9 | def setup_module(m): 10 | m.tempdir = clone_source() 11 | m.orig = os.getcwd() 12 | m.path = os.environ['PATH'] 13 | os.chdir(m.tempdir) 14 | os.environ['PATH'] = 
f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 15 | shell.run('make clean && make bsv csv bschema bsort bdropuntil', stream=True) 16 | 17 | def teardown_module(m): 18 | os.chdir(m.orig) 19 | os.environ['PATH'] = m.path 20 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 21 | shell.run('rm -rf', m.tempdir) 22 | 23 | @composite 24 | def inputs(draw): 25 | r = draw(randoms()) 26 | num_columns = draw(integers(min_value=1, max_value=4)) 27 | column = integers(min_value=-9223372036854775806, max_value=9223372036854775806) 28 | line = lists(column, min_size=num_columns, max_size=num_columns) 29 | lines = draw(lists(line, min_size=1)) 30 | lines = [[str(x) for x in line] for line in lines] 31 | first_column_values = [line[0] for line in lines] 32 | threshold = draw(floats(min_value=0, max_value=1)) 33 | for line in lines: 34 | if line and r.random() > threshold: 35 | line[0] = r.choice(first_column_values) 36 | csv = '\n'.join([','.join(l) for l in lines if l]).strip() + '\n' 37 | value = r.choice(first_column_values) 38 | return value, csv 39 | 40 | def expected(value, csv): 41 | value = int(value) 42 | res = [] 43 | found = False 44 | lines = csv.splitlines() 45 | lines = [[int(x) for x in l.split(',')] for l in lines] 46 | lines = sorted(lines) 47 | for cols in lines: 48 | if found: 49 | res.append(cols[0]) 50 | else: 51 | if cols: 52 | if cols[0] >= value: 53 | res.append(cols[0]) 54 | found = True 55 | return '\n'.join(map(str, res)) + '\n' 56 | 57 | @given(inputs()) 58 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 59 | def test_props(args): 60 | value, csv = args 61 | result = expected(value, csv) 62 | assert result.splitlines() == run(csv, f'bsv | bschema a:i64,... 
| bsort i64 | bdropuntil "{value}" i64 | bschema i64:a | csv').splitlines() 63 | -------------------------------------------------------------------------------- /test/bdropuntil_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import shell 4 | import random 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import text, lists, composite, integers, randoms, floats, sampled_from 8 | from test_util import run, clone_source, compile_buffer_sizes 9 | 10 | if os.environ.get('TEST_FACTOR'): 11 | buffers = list(sorted(set([12, 17, 64, 256, 1024, 1024 * 1024 * 5] + [random.randint(8, 1024) for _ in range(10)]))) 12 | else: 13 | buffers = [128] 14 | 15 | def setup_module(m): 16 | m.tempdir = clone_source() 17 | m.orig = os.getcwd() 18 | m.path = os.environ['PATH'] 19 | os.chdir(m.tempdir) 20 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 21 | shell.run('make clean', stream=True) 22 | compile_buffer_sizes('csv', buffers) 23 | compile_buffer_sizes('bsv', buffers) 24 | compile_buffer_sizes('bsort', buffers) 25 | compile_buffer_sizes('bdropuntil', buffers) 26 | shell.run('make bsv csv bsort bdropuntil', stream=True) 27 | 28 | def teardown_module(m): 29 | os.chdir(m.orig) 30 | os.environ['PATH'] = m.path 31 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 32 | shell.run('rm -rf', m.tempdir) 33 | 34 | def partition(r, n, x): 35 | res = [] 36 | ks = list(sorted({r.randint(1, max(1, len(x))) for _ in range(n)})) 37 | ks = [0] + ks 38 | ks[-1] = len(x) 39 | for a, b in zip(ks, ks[1:]): 40 | res.append(x[a:b]) 41 | return res 42 | 43 | @composite 44 | def inputs(draw): 45 | r = draw(randoms()) 46 | buffer = draw(sampled_from(buffers)) 47 | num_text_columns = draw(integers(min_value=1, max_value=4)) 48 | text_column = text(string.ascii_lowercase, min_size=1, max_size=8) 49 | text_line = lists(text_column, min_size=num_text_columns, max_size=num_text_columns) 50 | lines = draw(lists(text_line, min_size=1)) 51 | first_column_values = [line[0] for line in lines] 52 | threshold = draw(floats(min_value=0, max_value=1)) 53 | for line in lines: 54 | if line and r.random() > threshold: 55 | line[0] = r.choice(first_column_values) 56 | csv = '\n'.join([','.join(l)[:buffer // 4] for l in lines if l]).strip() + '\n' 57 | value = r.choice(first_column_values) 58 | return value, csv, buffer 59 | 60 | def expected(value, csv): 61 | res = [] 62 | found = False 63 | lines = csv.splitlines() 64 | lines = [l.split(',') for l in lines] 65 | lines = sorted(lines, key=lambda x: x[0]) 66 | for cols in lines: 67 | line = ','.join(str(c) for c in cols) 68 | if found: 69 | res.append(line) 70 | else: 71 | if cols: 72 | if cols[0] >= value: 73 | res.append(line) 74 | found = True 75 | return '\n'.join(res) + '\n' 76 | 77 | @given(inputs()) 78 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 79 | def test_props(args): 80 | value, csv, buffer = args 81 | result = expected(value, csv) 82 | assert set(result.splitlines()) == set(run(csv, f'bsv.{buffer} | bsort.{buffer} | bdropuntil.{buffer} "{value}" | csv.{buffer}').splitlines()) # set because sort is not stable and is only for first column values 83 | 84 | def test_example1(): 85 | value, csv = 'g', 'a\nb\nc\nd\ne\nf\ng\nh\n' 86 | 
result = expected(value, csv) 87 | assert result == run(csv, f'bsv 2>/dev/null | bsort | bdropuntil "{value}" | csv 2>/dev/null') 88 | 89 | def test_example2(): 90 | value, csv = 'a', 'a\n' 91 | result = expected(value, csv) 92 | assert result == run(csv, f'bsv 2>/dev/null | bsort | bdropuntil "{value}" | csv 2>/dev/null') 93 | 94 | def test_example3(): 95 | value, csv = 'ga', 'a\nb\nc\nddd\neee\nf\nga\n' 96 | result = expected(value, csv) 97 | assert result == run(csv, f'bsv 2>/dev/null | bsort | bdropuntil "{value}" | csv 2>/dev/null') 98 | 99 | def test_example4(): 100 | value, csv = 'b', 'a\na\na\nb\n' 101 | result = expected(value, csv) 102 | assert result == run(csv, f'bsv 2>/dev/null | bsort | bdropuntil "{value}" | csv 2>/dev/null') 103 | -------------------------------------------------------------------------------- /test/blz4d_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import shell 4 | import random 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import lists, composite, integers, text, sampled_from 8 | from test_util import clone_source, compile_buffer_sizes 9 | 10 | if os.environ.get('TEST_FACTOR'): 11 | buffers = list(sorted(set([128, 256, 1024, 1024 * 1024 * 5] + [random.randint(128, 1024) for _ in range(10)]))) 12 | else: 13 | buffers = [128] 14 | 15 | def setup_module(m): 16 | m.tempdir = clone_source() 17 | m.orig = os.getcwd() 18 | m.path = os.environ['PATH'] 19 | os.chdir(m.tempdir) 20 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 21 | shell.run('make clean && make bsv csv blz4 blz4d', stream=True) 22 | compile_buffer_sizes('csv', buffers) 23 | compile_buffer_sizes('bsv', buffers) 24 | compile_buffer_sizes('blz4', buffers) 25 | compile_buffer_sizes('blz4d', buffers) 26 | 27 | def teardown_module(m): 28 | os.chdir(m.orig) 29 | os.environ['PATH'] = m.path 30 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 31 | shell.run('rm -rf', m.tempdir) 32 | 33 | @composite 34 | def inputs(draw): 35 | buffer = draw(sampled_from(buffers)) 36 | num_columns = draw(integers(min_value=1, max_value=12)) 37 | column = text(string.ascii_lowercase, min_size=1) 38 | columns = lists(column, min_size=num_columns, max_size=num_columns) 39 | lines = draw(lists(columns, min_size=1)) 40 | csv = '\n'.join([','.join(line)[:64] for line in lines]) 41 | return buffer, csv 42 | 43 | @given(inputs()) 44 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 45 | def test_props(args): 46 | buffer, csv = args 47 | assert csv == shell.run(f'bsv.{buffer} | blz4.{buffer} | blz4d.{buffer} | csv.{buffer}', stdin=csv) 48 | -------------------------------------------------------------------------------- /test/bpartition_lz4_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import shell 4 | import collections 5 | import xxh3 6 | from hypothesis.database import ExampleDatabase 7 | from hypothesis import given, settings 8 | from hypothesis.strategies import text, lists, composite, integers, tuples 9 | from test_util import unindent, rm_whitespace, clone_source 10 | 11 | def setup_module(m): 12 | m.tempdir = clone_source() 13 | m.orig = os.getcwd() 14 | m.path = os.environ['PATH'] 15 | 
os.chdir(m.tempdir) 16 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 17 | shell.run('make clean && make bsv csv blz4d bcat bpartition', stream=True) 18 | 19 | def teardown_module(m): 20 | os.chdir(m.orig) 21 | os.environ['PATH'] = m.path 22 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 23 | shell.run('rm -rf', m.tempdir) 24 | 25 | @composite 26 | def inputs(draw): 27 | num_buckets = draw(integers(min_value=1, max_value=128)) 28 | num_columns = draw(integers(min_value=1, max_value=12)) 29 | column = text(string.ascii_lowercase, min_size=1) 30 | columns = lists(column, min_size=1, max_size=num_columns) 31 | lines = draw(lists(columns, min_size=1)) 32 | csv = '\n'.join([','.join(line) for line in lines]) + '\n' 33 | return num_buckets, csv 34 | 35 | def expected(num_buckets, csv): 36 | res = collections.defaultdict(list) 37 | size = len(str(num_buckets)) 38 | for line in csv.splitlines(): 39 | col0 = line.split(',', 1)[0] 40 | bucket = xxh3.oneshot_int(col0.encode()) % num_buckets 41 | res[str(bucket).zfill(size)].append(line) 42 | val = '' 43 | for k in sorted(res): 44 | for line in res[k]: 45 | val += f'prefix_{k}:{line}\n' 46 | return val.strip() 47 | 48 | @given(inputs()) 49 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 50 | def test_props(args): 51 | num_buckets, csv = args 52 | result = expected(num_buckets, csv) 53 | with shell.tempdir(): 54 | stdout = '\n'.join(sorted({l.split(':')[0] for l in result.splitlines()})) 55 | assert stdout == shell.run(f'bsv | bpartition -l {num_buckets} prefix', stdin=csv, echo=True) 56 | assert result == shell.run('bcat -l -p prefix*') 57 | 58 | def test_without_prefix(): 59 | with shell.tempdir(): 60 | stdin = """ 61 | b,c,d 62 | e,f,g 63 | h,i,j 64 | """ 65 | stdout = """ 66 | 02 67 | 04 68 | 05 69 | """ 70 | assert rm_whitespace(unindent(stdout)) == shell.run('bsv | bpartition -l 10', stdin=unindent(stdin)) 71 | 72 | def test_basic(): 73 | with shell.tempdir(): 74 | stdin = """ 75 | b,c,d 76 | e,f,g 77 | h,i,j 78 | """ 79 | stdout = """ 80 | prefix_02 81 | prefix_04 82 | prefix_05 83 | """ 84 | assert rm_whitespace(unindent(stdout)) == shell.run('bsv | bpartition -l 10 prefix', stdin=unindent(stdin)) 85 | stdout = """ 86 | prefix_02:h,i,j 87 | prefix_04:e,f,g 88 | prefix_05:b,c,d 89 | """ 90 | assert unindent(stdout).strip() == shell.run('bcat -l -p prefix*') 91 | stdout = """ 92 | prefix_02 93 | prefix_04 94 | prefix_05 95 | """ 96 | assert unindent(stdout).strip() == shell.run('ls prefix*') 97 | 98 | def test_appends(): 99 | with shell.tempdir(): 100 | stdin = """ 101 | b,c,d 102 | e,f,g 103 | h,i,j 104 | """ 105 | stdout = """ 106 | prefix_02 107 | prefix_04 108 | prefix_05 109 | """ 110 | assert rm_whitespace(unindent(stdout)) == shell.run('bsv | bpartition -l 10 prefix', stdin=unindent(stdin)) 111 | assert rm_whitespace(unindent(stdout)) == shell.run('bsv | bpartition -l 10 prefix', stdin=unindent(stdin)) 112 | stdout = """ 113 | prefix_02:h,i,j 114 | prefix_02:h,i,j 115 | prefix_04:e,f,g 116 | prefix_04:e,f,g 117 | prefix_05:b,c,d 118 | prefix_05:b,c,d 119 | """ 120 | assert unindent(stdout).strip() == shell.run('bcat -l -p prefix*') 121 | stdout = """ 122 | prefix_02 123 | prefix_04 124 | prefix_05 125 | """ 126 | assert unindent(stdout).strip() == shell.run('ls prefix*') 127 | 
-------------------------------------------------------------------------------- /test/bpartition_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import shell 4 | import collections 5 | import xxh3 6 | from hypothesis.database import ExampleDatabase 7 | from hypothesis import given, settings 8 | from hypothesis.strategies import text, lists, composite, integers, tuples 9 | from test_util import unindent, rm_whitespace, clone_source 10 | 11 | def setup_module(m): 12 | m.tempdir = clone_source() 13 | m.orig = os.getcwd() 14 | m.path = os.environ['PATH'] 15 | os.chdir(m.tempdir) 16 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 17 | shell.run('make clean && make bsv csv bcat bpartition', stream=True) 18 | 19 | def teardown_module(m): 20 | os.chdir(m.orig) 21 | os.environ['PATH'] = m.path 22 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 23 | shell.run('rm -rf', m.tempdir) 24 | 25 | @composite 26 | def inputs(draw): 27 | num_buckets = draw(integers(min_value=1, max_value=128)) 28 | num_columns = draw(integers(min_value=1, max_value=12)) 29 | column = text(string.ascii_lowercase, min_size=1) 30 | columns = lists(column, min_size=1, max_size=num_columns) 31 | lines = draw(lists(columns, min_size=1)) 32 | csv = '\n'.join([','.join(line) for line in lines]) + '\n' 33 | return num_buckets, csv 34 | 35 | def expected(num_buckets, csv): 36 | res = collections.defaultdict(list) 37 | size = len(str(num_buckets)) 38 | for line in csv.splitlines(): 39 | col0 = line.split(',', 1)[0] 40 | bucket = xxh3.oneshot_int(col0.encode()) % num_buckets 41 | res[str(bucket).zfill(size)].append(line) 42 | val = '' 43 | for k in sorted(res): 44 | for line in res[k]: 45 | val += f'prefix_{k}:{line}\n' 46 | return val.strip() 47 | 48 | @given(inputs()) 49 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 50 | def test_props(args): 51 | num_buckets, csv = args 52 | result = expected(num_buckets, csv) 53 | with shell.tempdir(): 54 | stdout = '\n'.join(sorted({l.split(':')[0] for l in result.splitlines()})) 55 | assert stdout == shell.run(f'bsv | bpartition {num_buckets} prefix', stdin=csv, echo=True) 56 | assert result == shell.run('bcat --prefix prefix*') 57 | 58 | def test_without_prefix(): 59 | with shell.tempdir(): 60 | stdin = """ 61 | b,c,d 62 | e,f,g 63 | h,i,j 64 | """ 65 | stdout = """ 66 | 02 67 | 04 68 | 05 69 | """ 70 | assert rm_whitespace(unindent(stdout)) == shell.run('bsv | bpartition 10', stdin=unindent(stdin)) 71 | 72 | def test_basic(): 73 | with shell.tempdir(): 74 | stdin = """ 75 | b,c,d 76 | e,f,g 77 | h,i,j 78 | """ 79 | stdout = """ 80 | prefix_02 81 | prefix_04 82 | prefix_05 83 | """ 84 | assert rm_whitespace(unindent(stdout)) == shell.run('bsv | bpartition 10 prefix', stdin=unindent(stdin)) 85 | stdout = """ 86 | prefix_02:h,i,j 87 | prefix_04:e,f,g 88 | prefix_05:b,c,d 89 | """ 90 | assert unindent(stdout).strip() == shell.run('bcat --prefix prefix*') 91 | stdout = """ 92 | prefix_02 93 | prefix_04 94 | prefix_05 95 | """ 96 | assert unindent(stdout).strip() == shell.run('ls prefix*') 97 | 98 | def test_appends(): 99 | with shell.tempdir(): 100 | stdin = """ 101 | b,c,d 102 | e,f,g 103 | h,i,j 104 | """ 105 | stdout = """ 106 | prefix_02 107 | prefix_04 108 | prefix_05 109 | """ 110 | assert 
rm_whitespace(unindent(stdout)) == shell.run('bsv | bpartition 10 prefix', stdin=unindent(stdin)) 111 | assert rm_whitespace(unindent(stdout)) == shell.run('bsv | bpartition 10 prefix', stdin=unindent(stdin)) 112 | stdout = """ 113 | prefix_02:h,i,j 114 | prefix_02:h,i,j 115 | prefix_04:e,f,g 116 | prefix_04:e,f,g 117 | prefix_05:b,c,d 118 | prefix_05:b,c,d 119 | """ 120 | assert unindent(stdout).strip() == shell.run('bcat --prefix prefix*') 121 | stdout = """ 122 | prefix_02 123 | prefix_04 124 | prefix_05 125 | """ 126 | assert unindent(stdout).strip() == shell.run('ls prefix*') 127 | -------------------------------------------------------------------------------- /test/brmerge_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import shell 4 | from hypothesis.database import ExampleDatabase 5 | from hypothesis import given, settings 6 | from hypothesis.strategies import text, lists, composite, integers 7 | from test_util import clone_source 8 | 9 | 10 | 11 | def setup_module(m): 12 | m.tempdir = clone_source() 13 | m.orig = os.getcwd() 14 | m.path = os.environ['PATH'] 15 | os.chdir(m.tempdir) 16 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 17 | shell.run('make clean && make bsv csv bsort bcut bmerge', stream=True) 18 | 19 | def teardown_module(m): 20 | os.chdir(m.orig) 21 | os.environ['PATH'] = m.path 22 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 23 | shell.run('rm -rf', m.tempdir) 24 | 25 | @composite 26 | def inputs(draw): 27 | num_inputs = 2 28 | csvs = [] 29 | for _ in range(num_inputs): 30 | num_columns = draw(integers(min_value=1, max_value=2)) 31 | column = text(string.ascii_lowercase, min_size=1, max_size=4) 32 | line = lists(column, min_size=num_columns, max_size=num_columns) 33 | lines = draw(lists(line)) 34 | csv = '\n'.join(sorted([','.join(x) for x in lines], reverse=True)) + '\n' 35 | csvs.append(csv) 36 | return csvs 37 | 38 | def expected(csvs): 39 | xs = [] 40 | for csv in csvs: 41 | xs += csv.splitlines() 42 | xs = sorted([x.split(',')[0] for x in xs], reverse=True) 43 | return '\n'.join(xs) + '\n' 44 | 45 | @given(inputs()) 46 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 47 | def test_props(csvs): 48 | result = expected(csvs) 49 | if result.strip(): 50 | with shell.tempdir(): 51 | paths = [] 52 | for i, csv in enumerate(csvs): 53 | path = f'file{i}.bsv' 54 | shell.run(f'bsv > {path}', stdin=csv) 55 | paths.append(path) 56 | assert result.strip() == shell.run('echo', *paths, '| bmerge -r | bcut 1 | csv', echo=True) 57 | assert shell.run('cat', *paths, '| bsort -r | bcut 1 | csv') == shell.run('echo', *paths, '| bmerge -r | bcut 1 | csv') 58 | -------------------------------------------------------------------------------- /test/brsort_f64_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import string 4 | import shell 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import text, lists, composite, integers, floats 8 | from test_util import run, rm_whitespace, clone_source 9 | 10 | def setup_module(m): 11 | m.tempdir = clone_source() 12 | m.orig = os.getcwd() 13 | m.path = os.environ['PATH'] 14 | 
os.chdir(m.tempdir) 15 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 16 | shell.run('make clean && make bsv csv bcut bsort bschema', stream=True) 17 | 18 | def teardown_module(m): 19 | os.chdir(m.orig) 20 | os.environ['PATH'] = m.path 21 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 22 | shell.run('rm -rf', m.tempdir) 23 | 24 | @composite 25 | def inputs(draw): 26 | num_columns = draw(integers(min_value=1, max_value=3)) 27 | column = floats(allow_nan=False, min_value=1e-10, max_value=1e10) 28 | line = lists(column, min_size=num_columns, max_size=num_columns) 29 | lines = draw(lists(line)) 30 | lines = [','.join(map(str, line)) for line in lines] 31 | return '\n'.join(lines) + '\n' 32 | 33 | def expected(csv): 34 | xs = csv.splitlines() 35 | xs = [float(x.split(',')[0]) for x in xs if x] 36 | xs = sorted(xs, reverse=True) 37 | return [round(x, 2) for x in xs] 38 | 39 | @given(inputs()) 40 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 41 | def test_props(csv): 42 | result = expected(csv) 43 | assert result == [round(float(x), 2) for x in run(csv, 'bsv | bschema a:f64,... | bsort f64 -r | bcut 1 | bschema f64:a | csv').splitlines() if x] 44 | -------------------------------------------------------------------------------- /test/brsort_i64_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import string 4 | import shell 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import text, lists, composite, integers 8 | from test_util import run, rm_whitespace, clone_source 9 | 10 | def setup_module(m): 11 | m.tempdir = clone_source() 12 | m.orig = os.getcwd() 13 | m.path = os.environ['PATH'] 14 | os.chdir(m.tempdir) 15 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 16 | shell.run('make clean && make bsv csv bcut bsort bschema', stream=True) 17 | 18 | def teardown_module(m): 19 | os.chdir(m.orig) 20 | os.environ['PATH'] = m.path 21 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 22 | shell.run('rm -rf', m.tempdir) 23 | 24 | @composite 25 | def inputs(draw): 26 | num_columns = draw(integers(min_value=1, max_value=3)) 27 | column = integers(min_value=-9223372036854775806, max_value=9223372036854775806) 28 | line = lists(column, min_size=num_columns, max_size=num_columns) 29 | lines = draw(lists(line)) 30 | lines = [','.join(map(str, line)) for line in lines] 31 | return '\n'.join(lines) + '\n' 32 | 33 | def expected(csv): 34 | xs = csv.splitlines() 35 | xs = [int(x.split(',')[0]) for x in xs if x] 36 | xs = sorted(xs, reverse=True) 37 | return '\n'.join(map(str, xs)) + '\n' 38 | 39 | @given(inputs()) 40 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 41 | def test_props(csv): 42 | result = expected(csv) 43 | assert result == run(csv, 'bsv | bschema a:i64,... 
| bsort i64 -r | bcut 1 | bschema i64:a | csv') 44 | -------------------------------------------------------------------------------- /test/brsort_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import string 4 | import shell 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import text, lists, composite, integers 8 | from test_util import run, rm_whitespace, clone_source 9 | 10 | def setup_module(m): 11 | m.tempdir = clone_source() 12 | m.orig = os.getcwd() 13 | m.path = os.environ['PATH'] 14 | os.chdir(m.tempdir) 15 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 16 | shell.run('make clean && make bsv csv bsort bcut', stream=True) 17 | 18 | def teardown_module(m): 19 | os.chdir(m.orig) 20 | os.environ['PATH'] = m.path 21 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 22 | shell.run('rm -rf', m.tempdir) 23 | 24 | @composite 25 | def inputs(draw): 26 | num_columns = draw(integers(min_value=1, max_value=64)) 27 | column = text(string.ascii_lowercase, min_size=1, max_size=64) 28 | line = lists(column, min_size=num_columns, max_size=num_columns) 29 | lines = draw(lists(line)) 30 | csv = '\n'.join([','.join(x) for x in lines]) + '\n' 31 | return csv 32 | 33 | def expected(csv): 34 | xs = csv.splitlines() 35 | xs = [x.split(',')[0] for x in xs] 36 | xs = sorted(xs, reverse=True) 37 | return '\n'.join(xs) + '\n' 38 | 39 | @given(inputs()) 40 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 41 | def test_props(csv): 42 | result = expected(csv) 43 | if result: 44 | assert result == run(csv, 'bsv | bsort -r | bcut 1 | csv') 45 | else: 46 | with pytest.raises(AssertionError): 47 | run(csv, 'bsv | bsort -r | bcut 1 | csv') 48 | 49 | @given(inputs()) 50 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 51 | def test_props_compatability(csv): 52 | assert run(csv, 'LC_ALL=C sort -r -k1,1 | cut -d, -f1') == run(csv, 'bsv | bsort --reversed | bcut 1 | csv') 53 | 54 | def test_compatability(): 55 | stdin = """ 56 | b 57 | c 58 | a 59 | """ 60 | stdout = """ 61 | c 62 | b 63 | a 64 | """ 65 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bsort --reversed | csv') 66 | -------------------------------------------------------------------------------- /test/brtopn_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import string 4 | import shell 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import text, lists, composite, integers 8 | from test_util import run, rm_whitespace, clone_source 9 | 10 | def setup_module(m): 11 | m.tempdir = clone_source() 12 | m.orig = os.getcwd() 13 | m.path = os.environ['PATH'] 14 | os.chdir(m.tempdir) 15 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 16 | shell.run('make clean && make bsv csv bcut btopn bschema', stream=True) 17 | 18 | def teardown_module(m): 19 | os.chdir(m.orig) 20 | os.environ['PATH'] = m.path 21 | assert m.tempdir.startswith('/tmp/') or 
m.tempdir.startswith('/private/var/folders/') 22 | shell.run('rm -rf', m.tempdir) 23 | 24 | @composite 25 | def inputs(draw): 26 | n = draw(integers(min_value=1, max_value=16)) 27 | num_columns = draw(integers(min_value=1, max_value=3)) 28 | column = text(string.ascii_lowercase, min_size=1, max_size=64) 29 | line = lists(column, min_size=num_columns, max_size=num_columns) 30 | lines = draw(lists(line)) 31 | lines = [','.join(map(str, line)) for line in lines] 32 | return n, '\n'.join(lines) + '\n' 33 | 34 | def expected(n, csv): 35 | xs = csv.splitlines() 36 | xs = [x.split(',')[0] for x in xs if x] 37 | xs = sorted(xs)[:n] 38 | return '\n'.join(xs) + '\n' 39 | 40 | @given(inputs()) 41 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 42 | def test_props(args): 43 | n, csv = args 44 | result = expected(n, csv) 45 | assert result == run(csv, f'bsv | btopn {n} -r | bcut 1 | csv ') 46 | -------------------------------------------------------------------------------- /test/bschema_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import shell 4 | from test_util import clone_source 5 | 6 | def setup_module(m): 7 | m.tempdir = clone_source() 8 | m.orig = os.getcwd() 9 | m.path = os.environ['PATH'] 10 | os.chdir(m.tempdir) 11 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 12 | shell.run('make clean && make bsv csv bschema', stream=True) 13 | 14 | def teardown_module(m): 15 | os.chdir(m.orig) 16 | os.environ['PATH'] = m.path 17 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 18 | shell.run('rm -rf', m.tempdir) 19 | 20 | def test_basic(): 21 | assert '0' == shell.run('echo 0 | bsv | bschema a:u16 | bschema u16:a | csv') 22 | assert '5' == shell.run('echo 5 | bsv | bschema a:u16 | bschema u16:a | csv') 23 | assert '5' == shell.run('echo 5 | bsv | bschema a:u32 | bschema u32:a | csv') 24 | assert '5' == shell.run('echo 5 | bsv | bschema a:u64 | bschema u64:a | csv') 25 | assert '5' == shell.run('echo 5 | bsv | bschema a:i16 | bschema i16:a | csv') 26 | assert '5' == shell.run('echo 5 | bsv | bschema a:i32 | bschema i32:a | csv') 27 | assert '5' == shell.run('echo 5 | bsv | bschema a:i64 | bschema i64:a | csv') 28 | assert '5' == shell.run('echo 5 | bsv | bschema a:f32 | bschema f32:a | csv').split('.')[0] 29 | assert '5' == shell.run('echo 5 | bsv | bschema a:f64 | bschema f64:a | csv').split('.')[0] 30 | assert '1' == shell.run('echo 1 | bsv | bschema 1,... | csv') 31 | assert '1,2' == shell.run('echo 1,2,3 | bsv | bschema 1,1,... | csv') 32 | with pytest.raises(Exception): 33 | shell.run('echo 1,2,3 | bsv | bschema fake,schema,errors | csv') 34 | with pytest.raises(Exception): 35 | shell.run('echo 1,2,3 | bsv | bschema 1,1 | csv') 36 | with pytest.raises(Exception): 37 | shell.run('echo 1,2,3 | bsv | bschema 1,1,1,1 | csv') 38 | with pytest.raises(Exception): 39 | shell.run('echo 1,2,3 | bsv | bschema 1,2,1 | csv') 40 | assert '12593,12850,13107' == shell.run('echo 11,22,33 | bsv | bschema u16:a,u16:a,u16:a | csv') 41 | assert '1,2,3' == shell.run('echo 1,2,3 | bsv | bschema 1,1,1 | csv') 42 | assert '1' == shell.run('echo 1,2,3 | bsv | bschema 1,... | csv') 43 | assert '1,2' == shell.run('echo 1,2,3 | bsv | bschema *,*,... | csv') 44 | assert '11,22' == shell.run('echo 11,22,33 | bsv | bschema *,*,... 
| csv') 45 | assert 'df,er' == shell.run('echo asdf,qwer | bsv | bschema "*2,*2" | csv') 46 | assert 'as,qw' == shell.run('echo asdf,qwer | bsv | bschema "2*,2*" | csv') 47 | with pytest.raises(Exception): 48 | shell.run('echo a,qwer,123 | bsv | bschema "2*,2*" | csv') 49 | with pytest.raises(Exception): 50 | shell.run('echo -1 | bsv | bschema "a:u64" | csv') 51 | 52 | def test_filtering(): 53 | assert '1,1\n2,2' == shell.run('echo -e "1,1\n2,2\n3\n" | bsv | bschema 1,1 --filter | csv') 54 | assert '22\n33' == shell.run('echo -e "1\n22\n33\n" | bsv | bschema 2 --filter | csv') 55 | assert '12850\n13107' == shell.run('echo -e "1\n22\n33\n" | bsv | bschema u16:a --filter | csv') 56 | assert 'as\n12' == shell.run('echo -e "asdf\nq\n123\n" | bsv | bschema "2*" --filter | csv') 57 | 58 | def test_maxint(): 59 | with pytest.raises(Exception): 60 | shell.run('echo 32768 | bsv | bschema a:i16') 61 | with pytest.raises(Exception): 62 | shell.run('echo -32769 | bsv | bschema a:i16') 63 | with pytest.raises(Exception): 64 | shell.run('echo -1 | bsv | bschema a:u16') 65 | with pytest.raises(Exception): 66 | shell.run('echo 65536 | bsv | bschema a:u16') 67 | with pytest.raises(Exception): 68 | shell.run('echo 2147483648 | bsv | bschema a:i32') 69 | with pytest.raises(Exception): 70 | shell.run('echo -2147483649 | bsv | bschema a:i32') 71 | with pytest.raises(Exception): 72 | shell.run('echo -1 | bsv | bschema a:u32') 73 | with pytest.raises(Exception): 74 | shell.run('echo 4294967296 | bsv | bschema a:u32') 75 | with pytest.raises(Exception): 76 | shell.run('echo -9223372036854775808 | bsv | bschema a:i64') 77 | with pytest.raises(Exception): 78 | shell.run('echo 9223372036854775807 | bsv | bschema a:i64') 79 | with pytest.raises(Exception): 80 | shell.run('echo -1 | bsv | bschema a:u64') 81 | with pytest.raises(Exception): 82 | shell.run('echo 18446744073709551615 | bsv | bschema a:u64') 83 | -------------------------------------------------------------------------------- /test/bsort_f64_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import string 4 | import shell 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import text, lists, composite, integers, floats 8 | from test_util import run, rm_whitespace, clone_source 9 | 10 | def setup_module(m): 11 | m.tempdir = clone_source() 12 | m.orig = os.getcwd() 13 | m.path = os.environ['PATH'] 14 | os.chdir(m.tempdir) 15 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 16 | shell.run('make clean && make bsv csv bcut bsort bschema', stream=True) 17 | 18 | def teardown_module(m): 19 | os.chdir(m.orig) 20 | os.environ['PATH'] = m.path 21 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 22 | shell.run('rm -rf', m.tempdir) 23 | 24 | @composite 25 | def inputs(draw): 26 | num_columns = draw(integers(min_value=1, max_value=3)) 27 | column = floats(allow_nan=False, min_value=1e-10, max_value=1e10) 28 | line = lists(column, min_size=num_columns, max_size=num_columns) 29 | lines = draw(lists(line)) 30 | lines = [','.join(map(str, line)) for line in lines] 31 | return '\n'.join(lines) + '\n' 32 | 33 | def expected(csv): 34 | xs = csv.splitlines() 35 | xs = [float(x.split(',')[0]) for x in xs if x] 36 | xs = sorted(xs) 37 | return [round(x, 2) for x in xs] 38 | 39 | @given(inputs()) 40 | @settings(database=ExampleDatabase(':memory:'), 
max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 41 | def test_props(csv): 42 | result = expected(csv) 43 | assert result == [round(float(x), 2) for x in run(csv, 'bsv | bschema a:f64,... | bsort f64 | bcut 1 | bschema f64:a | csv').splitlines() if x] 44 | -------------------------------------------------------------------------------- /test/bsort_i64_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import string 4 | import shell 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import text, lists, composite, integers 8 | from test_util import run, rm_whitespace, clone_source 9 | 10 | def setup_module(m): 11 | m.tempdir = clone_source() 12 | m.orig = os.getcwd() 13 | m.path = os.environ['PATH'] 14 | os.chdir(m.tempdir) 15 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 16 | shell.run('make clean && make bsv csv bcut bsort bschema', stream=True) 17 | 18 | def teardown_module(m): 19 | os.chdir(m.orig) 20 | os.environ['PATH'] = m.path 21 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 22 | shell.run('rm -rf', m.tempdir) 23 | 24 | @composite 25 | def inputs(draw): 26 | num_columns = draw(integers(min_value=1, max_value=3)) 27 | column = integers(min_value=-9223372036854775806, max_value=9223372036854775806) 28 | line = lists(column, min_size=num_columns, max_size=num_columns) 29 | lines = draw(lists(line)) 30 | lines = [','.join(map(str, line)) for line in lines] 31 | return '\n'.join(lines) + '\n' 32 | 33 | def expected(csv): 34 | xs = csv.splitlines() 35 | xs = [int(x.split(',')[0]) for x in xs if x] 36 | xs = sorted(xs) 37 | return '\n'.join(map(str, xs)) + '\n' 38 | 39 | @given(inputs()) 40 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 41 | def test_props(csv): 42 | result = expected(csv) 43 | assert result == run(csv, 'bsv | bschema a:i64,... | bsort i64 | bcut 1 | bschema i64:a | csv') 44 | 45 | 46 | @given(inputs()) 47 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 48 | def test_props_compatability(csv): 49 | assert run(csv, 'LC_ALL=C sort -n -k1,1 | cut -d, -f1') == run(csv, 'bsv | bschema a:i64,... 
| bsort i64 | bcut 1 | bschema i64:a | csv') 50 | -------------------------------------------------------------------------------- /test/bsort_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import string 4 | import shell 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import text, lists, composite, integers 8 | from test_util import run, rm_whitespace, clone_source 9 | 10 | def setup_module(m): 11 | m.tempdir = clone_source() 12 | m.orig = os.getcwd() 13 | m.path = os.environ['PATH'] 14 | os.chdir(m.tempdir) 15 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 16 | shell.run('make clean && make bsv csv bschema bcut bsort', stream=True) 17 | 18 | def teardown_module(m): 19 | os.chdir(m.orig) 20 | os.environ['PATH'] = m.path 21 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 22 | shell.run('rm -rf', m.tempdir) 23 | 24 | @composite 25 | def inputs(draw): 26 | num_columns = draw(integers(min_value=1, max_value=16)) 27 | column = text(string.ascii_letters + ':/|', min_size=1, max_size=64) 28 | line = lists(column, min_size=num_columns, max_size=num_columns) 29 | lines = draw(lists(line)) 30 | csv = '\n'.join([','.join(x) for x in lines]) + '\n' 31 | return csv 32 | 33 | def expected(csv): 34 | xs = csv.splitlines() 35 | xs = [x.split(',')[0] for x in xs] 36 | xs = sorted(xs) 37 | return '\n'.join(xs) + '\n' 38 | 39 | @given(inputs()) 40 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 41 | def test_props(csv): 42 | result = expected(csv) 43 | if result: 44 | assert result == run(csv, 'bsv | bsort | bcut 1 | csv') 45 | else: 46 | with pytest.raises(AssertionError): 47 | run(csv, 'bsv | bsort | csv') 48 | 49 | @given(inputs()) 50 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 51 | def test_props_compatability(csv): 52 | assert run(csv, 'LC_ALL=C sort -k1,1 | cut -d, -f1') == run(csv, 'bsv | bsort | bcut 1 | csv') 53 | 54 | def test_basic2(): 55 | stdin = """ 56 | a,b 57 | aa,a 58 | """ 59 | stdout = """ 60 | a,b 61 | aa,a 62 | """ 63 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bsort | csv') 64 | 65 | def test_basic(): 66 | stdin = """ 67 | aa 68 | a 69 | """ 70 | stdout = """ 71 | a 72 | aa 73 | """ 74 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bsort | csv') 75 | 76 | def test_compatability(): 77 | stdin = """ 78 | c 79 | b 80 | a 81 | """ 82 | stdout = """ 83 | a 84 | b 85 | c 86 | """ 87 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bsort | csv') 88 | 89 | def test_compatability2(): 90 | stdin = """ 91 | c,c 92 | b,b 93 | a,a 94 | """ 95 | stdout = """ 96 | a,a 97 | b,b 98 | c,c 99 | """ 100 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bsort | csv') 101 | -------------------------------------------------------------------------------- /test/bsplit_test.py: -------------------------------------------------------------------------------- 1 | import random 2 | import os 3 | import shell 4 | from hypothesis.database import ExampleDatabase 5 | from hypothesis import given, settings 6 | from 
hypothesis.strategies import composite, integers, sampled_from 7 | from test_util import compile_buffer_sizes, clone_source 8 | 9 | if os.environ.get('TEST_FACTOR'): 10 | buffers = list(sorted(set([64, 128, 256, 1024, 1024 * 1024 * 5] + [random.randint(64, 1024) for _ in range(10)]))) 11 | else: 12 | buffers = [128] 13 | 14 | def setup_module(m): 15 | m.tempdir = clone_source() 16 | m.orig = os.getcwd() 17 | m.path = os.environ['PATH'] 18 | os.chdir(m.tempdir) 19 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 20 | shell.run('make clean', stream=True) 21 | compile_buffer_sizes('bsv', buffers) 22 | compile_buffer_sizes('csv', buffers) 23 | compile_buffer_sizes('bsplit', buffers) 24 | shell.run('make bsv csv bsplit xxh3 _gen_csv') 25 | 26 | def teardown_module(m): 27 | os.chdir(m.orig) 28 | os.environ['PATH'] = m.path 29 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 30 | shell.run('rm -rf', m.tempdir) 31 | 32 | @composite 33 | def inputs(draw): 34 | buffer = draw(sampled_from(buffers)) 35 | lines = draw(integers(min_value=0, max_value=1024 * 8)) 36 | chunks_per_file = draw(integers(min_value=0, max_value=64)) 37 | return buffer, lines, chunks_per_file 38 | 39 | @given(inputs()) 40 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 41 | def test_props(args): 42 | buffer, lines, chunks_per_file = args 43 | if not chunks_per_file: 44 | chunks_per_file = '' 45 | with shell.tempdir(): 46 | shell.run(f'_gen_csv 2 {lines} | bsv.{buffer} > data.bsv', echo=True) 47 | shell.run(f'cat data.bsv | bsplit.{buffer} prefix {chunks_per_file} > filenames') 48 | assert shell.run(f'cat data.bsv | csv.{buffer} | xxh3') == shell.run(f'cat filenames | while read path; do cat $path; done | csv.{buffer} | xxh3') 49 | -------------------------------------------------------------------------------- /test/bsum_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import shell 4 | from hypothesis.database import ExampleDatabase 5 | from hypothesis import given, settings 6 | from hypothesis.strategies import text, lists, composite, integers 7 | from test_util import run, clone_source 8 | 9 | def setup_module(m): 10 | m.tempdir = clone_source() 11 | m.orig = os.getcwd() 12 | m.path = os.environ['PATH'] 13 | os.chdir(m.tempdir) 14 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 15 | shell.run('make clean && make bsv csv bschema bsum bcut', stream=True) 16 | 17 | def teardown_module(m): 18 | os.chdir(m.orig) 19 | os.environ['PATH'] = m.path 20 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 21 | shell.run('rm -rf', m.tempdir) 22 | 23 | @composite 24 | def inputs(draw): 25 | num_columns = draw(integers(min_value=1, max_value=64)) 26 | column = text(string.digits, min_size=1, max_size=16) 27 | line = lists(column, min_size=num_columns, max_size=num_columns) 28 | lines = draw(lists(line)) 29 | csv = '\n'.join([','.join(x) for x in lines]) + '\n' 30 | return csv 31 | 32 | def expected(csv): 33 | val = 0 34 | for line in csv.splitlines(): 35 | col = line.split(',')[0] 36 | if col: 37 | val += int(col) 38 | return val 39 | 40 | @given(inputs()) 41 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), 
deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 42 | def test_props(args): 43 | csv = args 44 | result = expected(csv) 45 | assert result == int(run(csv, 'bsv | bschema a:i64,... | bsum i64 | bcut 1 | bschema i64:a | csv')) 46 | 47 | def test1(): 48 | stdin = """ 49 | 1 50 | 1 51 | 1 52 | """ 53 | assert '3' == shell.run('bsv | bschema a:i64 | bsum i64 | bschema i64:a | csv', stdin=stdin) 54 | -------------------------------------------------------------------------------- /test/bsumeach_f64_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shell 3 | from test_util import run, rm_whitespace, rm_whitespace, clone_source 4 | 5 | def setup_module(m): 6 | m.tempdir = clone_source() 7 | m.orig = os.getcwd() 8 | m.path = os.environ['PATH'] 9 | os.chdir(m.tempdir) 10 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 11 | shell.run('make clean && make bsv csv bschema bsumeach', stream=True) 12 | 13 | def teardown_module(m): 14 | os.chdir(m.orig) 15 | os.environ['PATH'] = m.path 16 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 17 | shell.run('rm -rf', m.tempdir) 18 | 19 | def test_basic(): 20 | stdin = """ 21 | a,1.1 22 | a,2.1 23 | a,3.1 24 | b,4.1 25 | b,5.1 26 | a,6.1 27 | """ 28 | stdout = """ 29 | a,6.3 30 | b,9.2 31 | a,6.1 32 | """ 33 | result = run(rm_whitespace(stdin), 'bsv | bschema *,a:f64 | bsumeach f64 | bschema *,f64:a | csv') 34 | result = '\n'.join(f'{k},{round(float(v), 3)}' for line in result.splitlines() for k, v in [line.split(',')]) + '\n' 35 | assert rm_whitespace(stdout) + '\n' == result 36 | -------------------------------------------------------------------------------- /test/bsumeach_hash_i64_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shell 3 | from test_util import run, rm_whitespace, rm_whitespace, clone_source 4 | 5 | def setup_module(m): 6 | m.tempdir = clone_source() 7 | m.orig = os.getcwd() 8 | m.path = os.environ['PATH'] 9 | os.chdir(m.tempdir) 10 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 11 | shell.run('make clean && make bsv csv bsort bschema bsumeach-hash', stream=True) 12 | 13 | def teardown_module(m): 14 | os.chdir(m.orig) 15 | os.environ['PATH'] = m.path 16 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 17 | shell.run('rm -rf', m.tempdir) 18 | 19 | def test_basic(): 20 | stdin = """ 21 | a,1 22 | a,2 23 | a,3 24 | b,4 25 | b,5 26 | a,6 27 | """ 28 | stdout = """ 29 | a,12 30 | b,9 31 | """ 32 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bschema *,a:i64 | bsumeach-hash i64 | bschema *,i64:a | bsort | csv') 33 | -------------------------------------------------------------------------------- /test/bsumeach_hash_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nathants/bsv/1e8924d6e169b117138731cb90eafc8c626bea47/test/bsumeach_hash_test.py -------------------------------------------------------------------------------- /test/bsumeach_i64_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shell 3 | from test_util import run, rm_whitespace, rm_whitespace, clone_source 4 | 5 | def setup_module(m): 6 | m.tempdir = clone_source() 7 | m.orig = os.getcwd() 8 | m.path = 
os.environ['PATH'] 9 | os.chdir(m.tempdir) 10 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 11 | shell.run('make clean && make bsv csv bschema bsumeach', stream=True) 12 | 13 | def teardown_module(m): 14 | os.chdir(m.orig) 15 | os.environ['PATH'] = m.path 16 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 17 | shell.run('rm -rf', m.tempdir) 18 | 19 | def test_basic(): 20 | stdin = """ 21 | a,1 22 | a,2 23 | a,3 24 | b,4 25 | b,5 26 | a,6 27 | """ 28 | stdout = """ 29 | a,6 30 | b,9 31 | a,6 32 | """ 33 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bschema *,a:i64 | bsumeach i64 | bschema *,i64:a | csv') 34 | -------------------------------------------------------------------------------- /test/btake_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import shell 4 | from hypothesis.database import ExampleDatabase 5 | from hypothesis import given, settings 6 | from hypothesis.strategies import lists, composite, integers, randoms, floats, text 7 | from test_util import run, clone_source 8 | 9 | def setup_module(m): 10 | m.tempdir = clone_source() 11 | m.orig = os.getcwd() 12 | m.path = os.environ['PATH'] 13 | os.chdir(m.tempdir) 14 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 15 | shell.run('make clean && make bsv csv btake', stream=True) 16 | 17 | def teardown_module(m): 18 | os.chdir(m.orig) 19 | os.environ['PATH'] = m.path 20 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 21 | shell.run('rm -rf', m.tempdir) 22 | 23 | @composite 24 | def inputs(draw): 25 | r = draw(randoms()) 26 | num_text_columns = draw(integers(min_value=1, max_value=4)) 27 | text_column = text(string.ascii_lowercase, min_size=1, max_size=8) 28 | text_line = lists(text_column, min_size=num_text_columns, max_size=num_text_columns) 29 | lines = draw(lists(text_line, min_size=1)) 30 | first_column_values = [line[0] for line in lines] 31 | threshold = draw(floats(min_value=0, max_value=1)) 32 | for line in lines: 33 | if line and r.random() > threshold: 34 | line[0] = r.choice(first_column_values) 35 | csv = '\n'.join([','.join(l) for l in lines if l]).strip() + '\n' 36 | value = r.choice(first_column_values) 37 | return value, csv 38 | 39 | def parse(value): 40 | if value.isdigit(): 41 | value = int(value) 42 | return value 43 | 44 | def expected(value, csv): 45 | value = parse(value) 46 | res = [] 47 | for line in csv.splitlines(): 48 | columns = line.split(',') 49 | if columns and parse(columns[0]) != value: 50 | break 51 | res.append(','.join(str(parse(x)) for x in columns)) 52 | return '\n'.join(res) + '\n' 53 | 54 | @given(inputs()) 55 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 56 | def test_props(args): 57 | value, csv = args 58 | result = expected(value, csv) 59 | assert result == run(csv, f'bsv | btake "{value}" | csv') 60 | -------------------------------------------------------------------------------- /test/btakeuntil_i64_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shell 3 | import string 4 | from hypothesis.database import ExampleDatabase 5 | from hypothesis import given, settings 6 | from hypothesis.strategies import lists, composite, integers, 
randoms, floats, text 7 | from test_util import run, clone_source 8 | 9 | def setup_module(m): 10 | m.tempdir = clone_source() 11 | m.orig = os.getcwd() 12 | m.path = os.environ['PATH'] 13 | os.chdir(m.tempdir) 14 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 15 | shell.run('make clean && make bsv csv bschema bsort btakeuntil', stream=True) 16 | 17 | def teardown_module(m): 18 | os.chdir(m.orig) 19 | os.environ['PATH'] = m.path 20 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 21 | shell.run('rm -rf', m.tempdir) 22 | 23 | @composite 24 | def inputs(draw): 25 | r = draw(randoms()) 26 | num_columns = draw(integers(min_value=1, max_value=4)) 27 | column = integers(min_value=-9223372036854775806, max_value=9223372036854775806) 28 | line = lists(column, min_size=num_columns, max_size=num_columns) 29 | lines = draw(lists(line, min_size=1)) 30 | lines = [[str(x) for x in line] for line in lines] 31 | first_column_values = [line[0] for line in lines] 32 | threshold = draw(floats(min_value=0, max_value=1)) 33 | for line in lines: 34 | if line and r.random() > threshold: 35 | line[0] = r.choice(first_column_values) 36 | csv = '\n'.join([','.join(l) for l in lines if l]).strip() + '\n' 37 | value = r.choice(first_column_values) 38 | return value, csv 39 | 40 | def expected(value, csv): 41 | value = int(value) 42 | res = [] 43 | lines = csv.splitlines() 44 | lines = [[int(x) for x in line.split(',')] for line in lines] 45 | lines = sorted(lines) 46 | 47 | for cols in lines: 48 | if cols: 49 | if cols[0] >= value: 50 | break 51 | res.append(str(cols[0])) 52 | return '\n'.join(res) + '\n' 53 | 54 | @given(inputs()) 55 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 56 | def test_props(args): 57 | value, csv = args 58 | result = expected(value, csv) 59 | assert result.splitlines() == run(csv, f'bsv | bschema a:i64,... 
| bsort i64 | btakeuntil "{value}" i64 | bschema i64:a | csv').splitlines() 60 | -------------------------------------------------------------------------------- /test/btopn_i64_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import string 4 | import shell 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import text, lists, composite, integers 8 | from test_util import run, rm_whitespace, clone_source 9 | 10 | def setup_module(m): 11 | m.tempdir = clone_source() 12 | m.orig = os.getcwd() 13 | m.path = os.environ['PATH'] 14 | os.chdir(m.tempdir) 15 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 16 | shell.run('make clean && make bsv csv bcut btopn bschema', stream=True) 17 | 18 | def teardown_module(m): 19 | os.chdir(m.orig) 20 | os.environ['PATH'] = m.path 21 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 22 | shell.run('rm -rf', m.tempdir) 23 | 24 | @composite 25 | def inputs(draw): 26 | n = draw(integers(min_value=1, max_value=16)) 27 | num_columns = draw(integers(min_value=1, max_value=3)) 28 | column = integers(min_value=-9223372036854775806, max_value=9223372036854775806) 29 | line = lists(column, min_size=num_columns, max_size=num_columns) 30 | lines = draw(lists(line)) 31 | lines = [','.join(map(str, line)) for line in lines] 32 | return n, '\n'.join(lines) + '\n' 33 | 34 | def expected(n, csv): 35 | xs = csv.splitlines() 36 | xs = [int(x.split(',')[0]) for x in xs if x] 37 | xs = sorted(xs, reverse=True)[:n] 38 | return '\n'.join(map(str, xs)) + '\n' 39 | 40 | @given(inputs()) 41 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 42 | def test_props(args): 43 | n, csv = args 44 | result = expected(n, csv) 45 | assert result == run(csv, f'bsv | bschema a:i64,... 
| btopn {n} i64 | bschema i64:a | csv ') 46 | -------------------------------------------------------------------------------- /test/btopn_test.py: -------------------------------------------------------------------------------- 1 | import random 2 | import os 3 | import string 4 | import shell 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import text, lists, composite, integers, sampled_from 8 | from test_util import run, clone_source, compile_buffer_sizes 9 | 10 | if os.environ.get('TEST_FACTOR'): 11 | buffers = list(sorted(set([128, 256, 1024, 1024 * 1024 * 5] + [random.randint(128, 1024) for _ in range(10)]))) 12 | else: 13 | buffers = [128] 14 | 15 | def setup_module(m): 16 | m.tempdir = clone_source() 17 | m.orig = os.getcwd() 18 | m.path = os.environ['PATH'] 19 | os.chdir(m.tempdir) 20 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 21 | shell.run('make clean', stream=True) 22 | compile_buffer_sizes('csv', buffers) 23 | compile_buffer_sizes('bsv', buffers) 24 | compile_buffer_sizes('bcut', buffers) 25 | compile_buffer_sizes('btopn', buffers) 26 | compile_buffer_sizes('bschema', buffers) 27 | shell.run('make bsv csv bcut btopn bschema', stream=True) 28 | 29 | def teardown_module(m): 30 | os.chdir(m.orig) 31 | os.environ['PATH'] = m.path 32 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 33 | shell.run('rm -rf', m.tempdir) 34 | 35 | @composite 36 | def inputs(draw): 37 | buffer = draw(sampled_from(buffers)) 38 | n = draw(integers(min_value=1, max_value=16)) 39 | num_columns = draw(integers(min_value=1, max_value=3)) 40 | column = text(string.ascii_lowercase, min_size=1, max_size=20) 41 | line = lists(column, min_size=num_columns, max_size=num_columns) 42 | lines = draw(lists(line)) 43 | lines = [','.join(map(str, line)) for line in lines] 44 | return buffer, n, '\n'.join(lines) + '\n' 45 | 46 | def expected(n, csv): 47 | xs = csv.splitlines() 48 | xs = [x.split(',')[0] for x in xs if x] 49 | xs = sorted(xs, reverse=True)[:n] 50 | return '\n'.join(xs) + '\n' 51 | 52 | @given(inputs()) 53 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 54 | def test_props(args): 55 | buffer, n, csv = args 56 | result = expected(n, csv) 57 | assert result == run(csv, f'bsv.{buffer} | btopn.{buffer} {n} | bcut.{buffer} 1 | csv.{buffer}') 58 | -------------------------------------------------------------------------------- /test/bunzip_lz4_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import shell 4 | from hypothesis.database import ExampleDatabase 5 | from hypothesis import given, settings 6 | from hypothesis.strategies import lists, composite, integers, text 7 | from test_util import run, clone_source 8 | 9 | def setup_module(m): 10 | m.tempdir = clone_source() 11 | m.orig = os.getcwd() 12 | m.path = os.environ['PATH'] 13 | os.chdir(m.tempdir) 14 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 15 | shell.run('make clean && make bsv csv bunzip blz4d', stream=True) 16 | 17 | def teardown_module(m): 18 | os.chdir(m.orig) 19 | os.environ['PATH'] = m.path 20 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 21 | shell.run('rm -rf', m.tempdir) 22 | 23 | 
@composite 24 | def inputs(draw): 25 | num_columns = draw(integers(min_value=1, max_value=12)) 26 | zipcol = integers(min_value=0, max_value=num_columns - 1) 27 | zipcols = draw(lists(zipcol, min_size=1, max_size=16)) 28 | column = text(string.ascii_lowercase, min_size=1) 29 | columns = lists(column, min_size=num_columns, max_size=num_columns) 30 | lines = draw(lists(columns, min_size=1)) 31 | csv = '\n'.join([','.join(line) for line in lines]) + '\n' 32 | return zipcols, csv 33 | 34 | @given(inputs()) 35 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 36 | def test_props(args): 37 | zipcols, csv = args 38 | just = max(len(str(zipcol)) for zipcol in zipcols) 39 | zipcols = [str(i).rjust(just, '0') for i in zipcols] 40 | for i, column in enumerate(run(csv, 'bsv | bunzip -l prefix').splitlines()): 41 | result = '\n'.join(row.split(',')[i] for row in csv.splitlines()) 42 | assert result == shell.run(f'< {column} blz4d | csv') 43 | -------------------------------------------------------------------------------- /test/bunzip_test.py: -------------------------------------------------------------------------------- 1 | import random 2 | import os 3 | import string 4 | import shell 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import lists, composite, integers, text, sampled_from 8 | from test_util import run, clone_source, compile_buffer_sizes 9 | 10 | if os.environ.get('TEST_FACTOR'): 11 | buffers = list(sorted(set([128, 256, 1024, 1024 * 1024 * 5] + [random.randint(128, 1024) for _ in range(10)]))) 12 | else: 13 | buffers = [128] 14 | 15 | def setup_module(m): 16 | m.tempdir = clone_source() 17 | m.orig = os.getcwd() 18 | m.path = os.environ['PATH'] 19 | os.chdir(m.tempdir) 20 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 21 | shell.run('make clean', stream=True) 22 | compile_buffer_sizes('csv', buffers) 23 | compile_buffer_sizes('bsv', buffers) 24 | compile_buffer_sizes('bunzip', buffers) 25 | compile_buffer_sizes('bcat', buffers) 26 | shell.run('make bsv csv bcat bunzip', stream=True) 27 | 28 | def teardown_module(m): 29 | os.chdir(m.orig) 30 | os.environ['PATH'] = m.path 31 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 32 | shell.run('rm -rf', m.tempdir) 33 | 34 | @composite 35 | def inputs(draw): 36 | buffer = draw(sampled_from(buffers)) 37 | num_columns = draw(integers(min_value=1, max_value=12)) 38 | zipcol = integers(min_value=0, max_value=num_columns - 1) 39 | zipcols = draw(lists(zipcol, min_size=1, max_size=16)) 40 | column = text(string.ascii_lowercase, min_size=1, max_size=5) 41 | columns = lists(column, min_size=num_columns, max_size=num_columns) 42 | lines = draw(lists(columns, min_size=1)) 43 | csv = '\n'.join([','.join(line) for line in lines]) + '\n' 44 | return buffer, zipcols, csv 45 | 46 | @given(inputs()) 47 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 48 | def test_props(args): 49 | buffer, zipcols, csv = args 50 | just = max(len(str(zipcol)) for zipcol in zipcols) 51 | zipcols = [str(i).rjust(just, '0') for i in zipcols] 52 | for i, column in enumerate(run(csv, f'bsv.{buffer} | bunzip.{buffer} prefix').splitlines()): 53 | result = 
'\n'.join(row.split(',')[i] for row in csv.splitlines()) 54 | assert result == shell.run('bcat', column) 55 | -------------------------------------------------------------------------------- /test/bzip_lz4_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import uuid 3 | import os 4 | import string 5 | import shell 6 | import random 7 | from hypothesis.database import ExampleDatabase 8 | from hypothesis import given, settings 9 | from hypothesis.strategies import lists, composite, integers, text, randoms, sampled_from 10 | from test_util import run, clone_source, compile_buffer_sizes 11 | 12 | if os.environ.get('TEST_FACTOR'): 13 | buffers = list(sorted(set([128, 256, 1024, 1024 * 1024 * 5] + [random.randint(128, 1024) for _ in range(10)]))) 14 | else: 15 | buffers = [128] 16 | 17 | def setup_module(m): 18 | m.tempdir = clone_source() 19 | m.orig = os.getcwd() 20 | m.path = os.environ['PATH'] 21 | os.chdir(m.tempdir) 22 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 23 | shell.run('make clean', stream=True) 24 | compile_buffer_sizes('csv', buffers) 25 | compile_buffer_sizes('bsv', buffers) 26 | compile_buffer_sizes('blz4', buffers) 27 | compile_buffer_sizes('bzip', buffers) 28 | compile_buffer_sizes('bunzip', buffers) 29 | shell.run('make bsv csv blz4 bzip bunzip', stream=True) 30 | 31 | def teardown_module(m): 32 | os.chdir(m.orig) 33 | os.environ['PATH'] = m.path 34 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 35 | shell.run('rm -rf', m.tempdir) 36 | 37 | @composite 38 | def inputs(draw): 39 | buffer = draw(sampled_from(buffers)) 40 | rand = draw(randoms()) 41 | num_columns = draw(integers(min_value=1, max_value=12)) 42 | zipcol = integers(min_value=0, max_value=num_columns - 1) 43 | zipcols = draw(lists(zipcol, min_size=1, max_size=16)) 44 | zipcols = list(set(zipcols)) 45 | rand.shuffle(zipcols) 46 | column = text(string.ascii_lowercase, min_size=1, max_size=5) 47 | columns = lists(column, min_size=num_columns, max_size=num_columns) 48 | lines = draw(lists(columns, min_size=1)) 49 | csv = '\n'.join([','.join(line) for line in lines]) + '\n' 50 | return buffer, zipcols, csv 51 | 52 | def expected(zipcols, csv): 53 | res = [] 54 | for line in csv.splitlines(): 55 | columns = line.split(',') 56 | res.append(','.join(columns[zipcol] for zipcol in zipcols)) 57 | return '\n'.join(res) + '\n' 58 | 59 | @given(inputs()) 60 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 61 | def test_props(args): 62 | buffer, zipcols, csv = args 63 | result = expected(zipcols, csv) 64 | cols = ','.join(str(i + 1) for i in zipcols) 65 | prefix = str(uuid.uuid4()) 66 | assert result == run(csv, f'bsv.{buffer} | bunzip.{buffer} -l {prefix} >/dev/null && ls {prefix}_* | bzip.{buffer} -l {cols} | csv.{buffer}') 67 | 68 | def test_selection(): 69 | shell.run('echo -e "a\nb\n" | bsv | blz4 > a') 70 | shell.run('echo -e "1\n2\n" | bsv | blz4 > b') 71 | assert '1,a\n2,b' == shell.run('echo a b | bzip -l 2,1 | csv') 72 | assert 'a,1\nb,2' == shell.run('echo a b | bzip -l 1,2 | csv') 73 | assert 'a\nb' == shell.run('echo a b | bzip -l 1 | csv') 74 | assert '1\n2' == shell.run('echo a b | bzip -l 2 | csv') 75 | with pytest.raises(Exception): 76 | assert '1\n2' == shell.run('echo a b | bzip -l 0 | csv') 77 | with pytest.raises(Exception): 78 | assert 
'1\n2' == shell.run('echo a b | bzip -l 3 | csv') 79 | with pytest.raises(Exception): 80 | assert '1\n2' == shell.run('echo a b | bzip -l 1,1 | csv') 81 | 82 | def test_different_lengths(): 83 | shell.run('echo -e "a\nb\nc\n" | bsv | blz4 > a') 84 | shell.run('echo -e "a\nb\n" | bsv | blz4 > b') 85 | with pytest.raises(Exception): 86 | shell.run('echo a b | bzip -l') 87 | 88 | def test_more_than_1_column(): 89 | shell.run('echo -e "a\nb\nc\n" | bsv | blz4 > a') 90 | shell.run('echo -e "a\nb\nc,c\n" | bsv | blz4 > b') 91 | with pytest.raises(Exception): 92 | shell.run('echo a b | bzip -l') 93 | -------------------------------------------------------------------------------- /test/bzip_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import uuid 3 | import os 4 | import string 5 | import shell 6 | import random 7 | from hypothesis.database import ExampleDatabase 8 | from hypothesis import given, settings 9 | from hypothesis.strategies import lists, composite, integers, text, randoms, sampled_from 10 | from test_util import run, clone_source, compile_buffer_sizes 11 | 12 | if os.environ.get('TEST_FACTOR'): 13 | buffers = list(sorted(set([128, 256, 1024, 1024 * 1024 * 5] + [random.randint(128, 1024) for _ in range(10)]))) 14 | else: 15 | buffers = [128] 16 | 17 | def setup_module(m): 18 | m.tempdir = clone_source() 19 | m.orig = os.getcwd() 20 | m.path = os.environ['PATH'] 21 | os.chdir(m.tempdir) 22 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 23 | shell.run('make clean', stream=True) 24 | compile_buffer_sizes('csv', buffers) 25 | compile_buffer_sizes('bsv', buffers) 26 | compile_buffer_sizes('bzip', buffers) 27 | compile_buffer_sizes('bunzip', buffers) 28 | shell.run('make bsv csv bzip bunzip', stream=True) 29 | 30 | 31 | def teardown_module(m): 32 | os.chdir(m.orig) 33 | os.environ['PATH'] = m.path 34 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 35 | shell.run('rm -rf', m.tempdir) 36 | 37 | @composite 38 | def inputs(draw): 39 | buffer = draw(sampled_from(buffers)) 40 | rand = draw(randoms()) 41 | num_columns = draw(integers(min_value=1, max_value=12)) 42 | zipcol = integers(min_value=0, max_value=num_columns - 1) 43 | zipcols = draw(lists(zipcol, min_size=1, max_size=16)) 44 | zipcols = list(set(zipcols)) 45 | rand.shuffle(zipcols) 46 | column = text(string.ascii_lowercase, min_size=1, max_size=5) 47 | columns = lists(column, min_size=num_columns, max_size=num_columns) 48 | lines = draw(lists(columns, min_size=1)) 49 | csv = '\n'.join([','.join(line) for line in lines]) + '\n' 50 | return buffer, zipcols, csv 51 | 52 | def expected(zipcols, csv): 53 | res = [] 54 | for line in csv.splitlines(): 55 | columns = line.split(',') 56 | res.append(','.join(columns[zipcol] for zipcol in zipcols)) 57 | return '\n'.join(res) + '\n' 58 | 59 | @given(inputs()) 60 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 61 | def test_props(args): 62 | buffer, zipcols, csv = args 63 | result = expected(zipcols, csv) 64 | cols = ','.join(str(i + 1) for i in zipcols) 65 | prefix = str(uuid.uuid4()) 66 | assert result == run(csv, f'bsv.{buffer} | bunzip.{buffer} {prefix} >/dev/null && ls {prefix}_* | bzip.{buffer} {cols} | csv.{buffer}') 67 | 68 | def test_selection(): 69 | shell.run('echo -e "a\nb\n" | bsv > a') 70 | shell.run('echo -e 
"1\n2\n" | bsv > b') 71 | assert '1,a\n2,b' == shell.run('echo a b | bzip 2,1 | csv') 72 | assert 'a,1\nb,2' == shell.run('echo a b | bzip 1,2 | csv') 73 | assert 'a\nb' == shell.run('echo a b | bzip 1 | csv') 74 | assert '1\n2' == shell.run('echo a b | bzip 2 | csv') 75 | with pytest.raises(Exception): 76 | assert '1\n2' == shell.run('echo a b | bzip 0 | csv') 77 | with pytest.raises(Exception): 78 | assert '1\n2' == shell.run('echo a b | bzip 3 | csv') 79 | with pytest.raises(Exception): 80 | assert '1\n2' == shell.run('echo a b | bzip 1,1 | csv') 81 | 82 | def test_different_lengths(): 83 | shell.run('echo -e "a\nb\nc\n" | bsv > a') 84 | shell.run('echo -e "a\nb\n" | bsv > b') 85 | with pytest.raises(Exception): 86 | shell.run('echo a b | bzip') 87 | 88 | def test_more_than_1_column(): 89 | shell.run('echo -e "a\nb\nc\n" | bsv > a') 90 | shell.run('echo -e "a\nb\nc,c\n" | bsv > b') 91 | with pytest.raises(Exception): 92 | shell.run('echo a b | bzip') 93 | -------------------------------------------------------------------------------- /test/csv_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nathants/bsv/1e8924d6e169b117138731cb90eafc8c626bea47/test/csv_test.py -------------------------------------------------------------------------------- /test/test_util.py: -------------------------------------------------------------------------------- 1 | import shell 2 | import sys 3 | import uuid 4 | import os 5 | 6 | with shell.climb_git_root(): 7 | max_columns = int(shell.run('cat util/util.h | grep "define MAX_COLUMNS"').split()[-1]) 8 | 9 | def clone_source(): 10 | with shell.climb_git_root(): 11 | orig = os.getcwd() 12 | with shell.tempdir(cleanup=False): 13 | shell.run(f"rsync -avhc {orig}/ . 
--exclude '.git' --exclude '.tox' --exclude '.backups' --exclude '__pycache__' --exclude '.hypothesis' --exclude '.ccls-cache'") 14 | shell.run('mkdir .git') 15 | return os.getcwd() 16 | 17 | def run(stdin, *args): 18 | with shell.climb_git_root(): 19 | stdinpath = f'stdin.{uuid.uuid4()}' 20 | stdoutpath = f'stdout.{uuid.uuid4()}' 21 | with open(stdinpath, 'w') as f: 22 | f.write(stdin) 23 | shell.run(*(('set -o pipefail; cat', stdinpath, '|') + args + ('>', stdoutpath)), stream=True) 24 | with open(stdoutpath) as f: 25 | return f.read() 26 | 27 | def runb(stdin, *args): 28 | with shell.climb_git_root(): 29 | stdinpath = f'stdin.{uuid.uuid4()}' 30 | stdoutpath = f'stdout.{uuid.uuid4()}' 31 | if isinstance(stdin, str): 32 | with open(stdinpath, 'w') as f: 33 | f.write(stdin) 34 | else: 35 | with open(stdinpath, 'wb') as f: 36 | f.write(stdin) 37 | shell.run(*(('set -o pipefail; cat', stdinpath, '|') + args + ('>', stdoutpath)), stream=True) 38 | with open(stdoutpath, 'rb') as f: 39 | return f.read() 40 | 41 | def unindent(text): 42 | return '\n'.join([x.lstrip() for x in text.splitlines()]) + '\n' 43 | 44 | def rm_whitespace(x): 45 | return '\n'.join([y.strip().replace(' ', '') for y in x.splitlines() if y.strip()]) 46 | 47 | def compile_buffer_sizes(name, buffers): 48 | with shell.climb_git_root(): 49 | shell.run('cp -f util/util.h util/util.h.bak') 50 | try: 51 | for i in buffers: 52 | shell.run(f'cat util/util.h.bak | sed -E "s/#define BUFFER_SIZE .*/#define BUFFER_SIZE {i}/" > util/util.h') 53 | print('compile:', name, i, flush=True, file=sys.stderr) 54 | shell.run('make', name) 55 | shell.run(f'mv -f bin/{name} bin/{name}.{i}') 56 | finally: 57 | shell.run('cat util/util.h.bak > util/util.h') 58 | shell.run('rm -f util/util.h.bak') 59 | -------------------------------------------------------------------------------- /test/xxh3_test.py: -------------------------------------------------------------------------------- 1 | import shell 2 | import io 3 | import os 4 | import xxh3 5 | from test_util import clone_source 6 | 7 | def setup_module(m): 8 | m.tempdir = clone_source() 9 | m.orig = os.getcwd() 10 | m.path = os.environ['PATH'] 11 | os.chdir(m.tempdir) 12 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 13 | shell.run('make clean && make xxh3', stream=True) 14 | 15 | def teardown_module(m): 16 | os.chdir(m.orig) 17 | os.environ['PATH'] = m.path 18 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 19 | shell.run('rm -rf', m.tempdir) 20 | 21 | def test_hex(): 22 | assert '079364cbfdf9f4cb' == shell.run('echo abc | xxh3') 23 | assert '079364cbfdf9f4cb' == xxh3.oneshot_hex('abc\n'.encode()) 24 | 25 | def test_int(): 26 | assert '545890807144117451' == shell.run('echo abc | xxh3 --int') 27 | assert 545890807144117451 == xxh3.oneshot_int('abc\n'.encode()) 28 | 29 | def test_stream(): 30 | assert { 31 | 'cmd': 'set -eou pipefail; echo abc | xxh3 --stream', 32 | 'exitcode': 0, 33 | 'stderr': '079364cbfdf9f4cb', 34 | 'stdout': 'abc', 35 | } == shell.run('echo abc | xxh3 --stream', warn=True) 36 | assert '079364cbfdf9f4cb' == xxh3.stream_hex(io.BytesIO('abc\n'.encode())) 37 | assert { 38 | 'cmd': 'set -eou pipefail; echo abc | xxh3 --stream --int', 39 | 'exitcode': 0, 40 | 'stderr': '545890807144117451', 41 | 'stdout': 'abc', 42 | } == shell.run('echo abc | xxh3 --stream --int', warn=True) 43 | assert 545890807144117451 == xxh3.stream_int(io.BytesIO('abc\n'.encode())) 44 | 
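# as test_stream above shows, xxh3 --stream tees its input through to stdout
# and writes the checksum to stderr, so it can sit inside a pipeline without
# consuming the data. a sketch (file names illustrative):
#
#   cat data.csv | xxh3 --stream 2>data.checksum | bsv > data.bsv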
-------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = python3 3 | skipsdist = True 4 | 5 | [testenv] 6 | passenv = * 7 | whitelist_externals = bash 8 | commands = 9 | bash -xc 'py.test -n auto -vx --tb native --durations 40 test/' 10 | 11 | deps = 12 | git+https://github.com/nathants/py-util 13 | git+https://github.com/nathants/py-shell 14 | git+https://github.com/nathants/py-pool 15 | git+https://github.com/nathants/cffi-xxh3 16 | hypothesis 17 | pytest 18 | pytest-xdist 19 | numpy 20 | -------------------------------------------------------------------------------- /util/array.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define ARRAY_EXPAND_CAPACITY 1024 * 512 4 | 5 | #define ARRAY_INIT(array, type) \ 6 | u64 array##_size = 0; \ 7 | u64 array##_capacity = ARRAY_EXPAND_CAPACITY; \ 8 | type *array; \ 9 | MALLOC(array, sizeof(type) * array##_capacity); 10 | 11 | #define ARRAY_ADD(array, size, type) \ 12 | do { \ 13 | if (array##_size + size > array##_capacity) { \ 14 | array##_capacity += ARRAY_EXPAND_CAPACITY; \ 15 | REALLOC(array, sizeof(type) * array##_capacity); \ 16 | } \ 17 | array##_size += size; \ 18 | } while(0) 19 | 20 | #define ARRAY_APPEND(array, val, type) \ 21 | do { \ 22 | if (array##_size == array##_capacity) { \ 23 | array##_capacity += ARRAY_EXPAND_CAPACITY; \ 24 | REALLOC(array, sizeof(type) * array##_capacity); \ 25 | } \ 26 | array[array##_size++] = val; \ 27 | } while(0) 28 | 29 | #define ARRAY_POP(array, dst) \ 30 | do { \ 31 | if (array##_size) { \ 32 | dst = array[--array##_size]; \ 33 | } else { \ 34 | dst = NULL; \ 35 | } \ 36 | } while(0) 37 | 38 | #define ARRAY_RESET(array) \ 39 | do { \ 40 | array##_size = 0; \ 41 | } while(0) 42 | 43 | #define ARRAY_SIZE(array) \ 44 | array##_size 45 | -------------------------------------------------------------------------------- /util/dump.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "row.h" 4 | #include "write.h" 5 | 6 | #define ASSERT_SIZE_IS_VALID() ASSERT(row->sizes[i] <= MAX_COLUMNS - 1, "fatal: cannot have columns with more than 2**16 - 1 bytes, column: %d, size: %d, content: %.*s...\n", i, row->sizes[i], 10, row->columns[i]) 7 | #define ASSERT_MAX_IS_VALID() ASSERT(row->max <= MAX_COLUMNS, "fatal: cannot have more than 2**16 columns\n") 8 | 9 | // 10 | // NOTE: the memory pointed to by row->columns will be copied by 11 | // dump(), and can safely be mutated after the dump() returns.
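// NOTE: the wire layout written below (and parsed by load.h), assuming
// native-endian u16s, is:
//
//   max:u16 | size_0:u16 ... size_max:u16 | col_0 \0 ... col_max \0
//
// e.g. the two-column row ("ab", "c") dumps as 11 bytes: max=1, sizes=2,1,
// then the bytes 'a' 'b' '\0' 'c' '\0'.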
12 | // 13 | inlined void dump(writebuf_t *wbuf, const row_t *row, i32 file) { 14 | ASSERT_MAX_IS_VALID(); 15 | i32 size = sizeof(u16) + (row->max + 1) * sizeof(u16); // -------------- init size with max:u16 + size1:u16,...sizen:u16 16 | for (i32 i = 0; i <= row->max; i++) 17 | size += row->sizes[i] + 1; // -------------------------------------- update size with column size + \0 18 | write_start(wbuf, size, file); // -------------------------------------- write start in case total size of writes would exceed the buffer 19 | write_bytes(wbuf, TO_UINT16(row->max), sizeof(u16), file); // ---------- write row->max 20 | for (i32 i = 0; i <= row->max; i++) { 21 | ASSERT_SIZE_IS_VALID(); 22 | write_bytes(wbuf, TO_UINT16(row->sizes[i]), sizeof(u16), file); // - write row->sizes 23 | } 24 | for (i32 i = 0; i <= row->max; i++) { 25 | write_bytes(wbuf, row->columns[i], row->sizes[i], file); // -------- write column 26 | write_bytes(wbuf, "\0", 1, file); // ------------------------------- add a trailing \0 after every column to make strcmp easier 27 | } 28 | } 29 | 30 | inlined void dump_raw(writebuf_t *wbuf, const raw_row_t *raw_row, i32 file) { 31 | write_start(wbuf, raw_row->header_size + raw_row->buffer_size, file); 32 | write_bytes(wbuf, raw_row->header, raw_row->header_size, file); 33 | write_bytes(wbuf, raw_row->buffer, raw_row->buffer_size, file); 34 | } 35 | 36 | void dump_flush(writebuf_t *wbuf, i32 file) { 37 | write_flush(wbuf, file); 38 | } 39 | -------------------------------------------------------------------------------- /util/load.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "row.h" 4 | #include "read.h" 5 | 6 | // 7 | // NOTE: the memory pointed to by row->columns can only be used until 8 | // the next call of load_next(), which may mutate that memory. if you 9 | // need it after that you must copy it somewhere else before the next 10 | // call of load_next(). 11 | // 12 | // NOTE: you must not mutate memory pointed to by row->columns, which 13 | // should be considered readonly. 
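// NOTE: a typical consumer loop looks like this (a sketch; rbuf is assumed
// to be a readbuf_t from rbuf_init):
//
//   row_t row;
//   while (1) {
//       load_next(&rbuf, &row, 0);
//       if (row.stop)
//           break;
//       // row.max, row.sizes[i], and row.columns[i] are valid here
//   }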
14 | // 15 | inlined void load_next(readbuf_t *rbuf, row_t *row, i32 file) { 16 | read_bytes(rbuf, sizeof(u16), file); // ------------------------------------- read max, the max zero based index into columns data 17 | switch(rbuf->bytes) { 18 | case sizeof(u16): 19 | row->stop = 0; 20 | row->max = FROM_UINT16(rbuf->buffer); // ---------------------------- parse max 21 | read_bytes_assert(rbuf, (row->max + 1) * sizeof(u16), file); // ----- read sizes 22 | i32 size = row->max + 1; // ----------------------------------------- total size in bytes of all columns, including trailing \0 23 | for (i32 i = 0; i <= row->max; i++) { 24 | row->sizes[i] = FROM_UINT16(rbuf->buffer + i * sizeof(u16)); // - parse sizes 25 | size += row->sizes[i]; // --------------------------------------- update total size 26 | } 27 | read_bytes_assert(rbuf, size * sizeof(u8), file); // ---------------- read all column bytes 28 | row->columns[0] = rbuf->buffer; 29 | for (i32 i = 0; i < row->max; i++) 30 | row->columns[i + 1] = row->columns[i] + row->sizes[i] + 1; // --- setup pointers to read_buffer and skip trailing \0 31 | break; 32 | case 0: 33 | row->stop = 1; // --------------------------------------------------- empty read means EOF 34 | break; 35 | default: 36 | ASSERT(0, "fatal: load.h read size of row got bad num bytes, this should never happen\n"); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /util/queue.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util.h" 4 | 5 | typedef struct node_s node_t; 6 | 7 | struct node_s { 8 | u8 *val; 9 | node_t *next; 10 | }; 11 | 12 | typedef struct queue_s { 13 | i32 size; 14 | i32 capacity; 15 | node_t *head; 16 | node_t *tail; 17 | } queue_t; 18 | 19 | queue_t *queue_init(i32 capacity) { 20 | queue_t *q; 21 | MALLOC(q, sizeof(*q)); 22 | q->size = 0; 23 | q->capacity = capacity; 24 | q->head = NULL; 25 | q->tail = NULL; 26 | return q; 27 | } 28 | 29 | i32 queue_put(queue_t *q, u8 *val) { 30 | if (q->size == q->capacity) 31 | return 1; 32 | node_t *n; 33 | MALLOC(n, sizeof(*n)); 34 | n->val = val; 35 | n->next = NULL; 36 | if (!q->head) { 37 | q->head = n; 38 | q->tail = n; 39 | q->size = 1; 40 | return 0; 41 | } 42 | q->tail->next = n; 43 | q->tail = n; 44 | q->size++; 45 | return 0; 46 | } 47 | 48 | u8 *queue_get(queue_t *q) { 49 | if (!q->size) 50 | return NULL; 51 | node_t *n = q->head; 52 | u8 *val = n->val; 53 | q->head = n->next; 54 | free(n); 55 | q->size--; 56 | return val; 57 | } 58 | -------------------------------------------------------------------------------- /util/read.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util.h" 4 | #include "lz4.h" 5 | 6 | typedef struct readbuf_s { 7 | // public 8 | u8 *buffer; 9 | i32 bytes; 10 | // private 11 | FILE **files; 12 | u8 **buffers; 13 | i32 bytes_left; 14 | i32 bytes_read; 15 | i32 *offset; 16 | i32 *chunk_size; 17 | bool lz4; 18 | u8 *lz4_buf; 19 | i32 lz4_size; 20 | } readbuf_t; 21 | 22 | readbuf_t rbuf_init(FILE **files, i32 num_files, bool lz4) { 23 | readbuf_t *buf; 24 | MALLOC(buf, sizeof(readbuf_t)); 25 | buf->files = files; 26 | MALLOC(buf->buffers, sizeof(u8*) * num_files); 27 | MALLOC(buf->offset, sizeof(i32) * num_files); 28 | MALLOC(buf->chunk_size, sizeof(i32) * num_files); 29 | for (i32 i = 0; i < num_files; i++) { 30 | buf->chunk_size[i] = BUFFER_SIZE; 31 | buf->offset[i] = BUFFER_SIZE; 32 | 
MALLOC(buf->buffers[i], BUFFER_SIZE); 33 | } 34 | buf->lz4 = lz4; 35 | if (lz4) 36 | MALLOC(buf->lz4_buf, BUFFER_SIZE_LZ4); 37 | return *buf; 38 | } 39 | 40 | #define DECOMPRESS(buf) \ 41 | do { \ 42 | i32 decompressed_size = LZ4_decompress_safe(buf->lz4_buf, buf->buffers[file], buf->lz4_size, BUFFER_SIZE); \ 43 | ASSERT(buf->chunk_size[file] == decompressed_size, "fatal: decompress size mismatch\n"); \ 44 | } while(0) 45 | 46 | inlined void read_bytes(readbuf_t *buf, i32 size, i32 file) { 47 | buf->bytes_left = buf->chunk_size[file] - buf->offset[file]; // ------------------------------------ bytes left in the current chunk 48 | buf->bytes = size; 49 | ASSERT(buf->bytes_left >= 0, "fatal: negative bytes_left: %d\n", buf->bytes_left); 50 | if (buf->bytes_left == 0) { // --------------------------------------------------------------------- time to read the next chunk 51 | buf->bytes_read = fread_unlocked(&buf->chunk_size[file], 1, sizeof(i32), buf->files[file]); // - try read chunk size 52 | switch (buf->bytes_read) { 53 | case sizeof(i32): // ----------------------------------------------------------------------- read chunk size succeeded 54 | ASSERT(buf->chunk_size[file] <= BUFFER_SIZE, "fatal: bad chunk size: %d\n", buf->chunk_size[file]); 55 | #ifdef READ_GROWING // when defined hold all data in ram for sorting 56 | MALLOC(buf->buffers[file], buf->chunk_size[file]); 57 | #endif 58 | if (buf->lz4) { 59 | FREAD(&buf->lz4_size, sizeof(i32), buf->files[file]); // --------------------------- read compressed size 60 | FREAD(buf->lz4_buf, buf->lz4_size, buf->files[file]); // --------------------------- read compressed chunk 61 | DECOMPRESS(buf); 62 | } else 63 | FREAD(buf->buffers[file], buf->chunk_size[file], buf->files[file]); // ------------- read the chunk body 64 | buf->offset[file] = 0; // -------------------------------------------------------------- start at the beginning of the new chunk 65 | buf->bytes_left = buf->chunk_size[file]; // -------------------------------------------- bytes_left is the new chunk size 66 | ASSERT(size <= buf->bytes_left, "fatal: diskread, not possible, chunk sizes are known\n"); 67 | break; 68 | case 0: // --------------------------------------------------------------------------------- read chunk size failed 69 | ASSERT(!ferror_unlocked(buf->files[file]), "fatal: read error\n"); 70 | buf->chunk_size[file] = 0; 71 | buf->offset[file] = 0; 72 | buf->bytes = 0; 73 | break; 74 | default: 75 | ASSERT(0, "fatal: impossible\n"); 76 | } 77 | } else 78 | ASSERT(size <= buf->bytes_left, "fatal: ramread, not possible, chunk sizes are known\n"); 79 | buf->buffer = buf->buffers[file] + buf->offset[file]; // ------------------------------------------- update the buffer position for the current read 80 | buf->offset[file] += buf->bytes; // ---------------------------------------------------------------- update the buffer offset 81 | } 82 | 83 | inlined void read_bytes_assert(readbuf_t *buf, i32 size, i32 file) { 84 | read_bytes(buf, size, file); 85 | ASSERT(buf->bytes == size, "didnt read enough, only got: %d, expected: %d\n", (buf)->bytes, size); 86 | } 87 | -------------------------------------------------------------------------------- /util/read_ahead.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "read.h" 4 | #include "util.h" 5 | 6 | typedef struct readaheadbuf_s { 7 | i32 has_nexted; 8 | u8 **last_buffers; 9 | i32 *last_chunk_size; 10 | i32 _i32; 11 | u8 * _u8s; 12 | } readaheadbuf_t; 13 | 14 | 
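// for reference, the chunk framing that read.h parses and write.h emits
// (a sketch, reconstructed from read_bytes/write_flush):
//
//   plain: [chunk_size:i32][chunk_size bytes] ...
//   lz4:   [chunk_size:i32][lz4_size:i32][lz4_size compressed bytes] ...
//
// chunk_size is always the decompressed size; DECOMPRESS() asserts as much.
// read_ahead.h swaps whole chunks in and out so a caller can peek at the
// next chunk and later restore last+current as one contiguous buffer.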
readaheadbuf_t rabuf_init(i32 num_files) { 15 | readaheadbuf_t *buf; 16 | MALLOC(buf, sizeof(readaheadbuf_t)); 17 | buf->has_nexted = 0; 18 | MALLOC(buf->last_buffers, sizeof(u8*) * num_files); 19 | MALLOC(buf->last_chunk_size, sizeof(i32) * num_files); 20 | for (i32 i = 0; i < num_files; i++) { 21 | MALLOC(buf->last_buffers[i], BUFFER_SIZE); 22 | } 23 | return *buf; 24 | } 25 | 26 | inlined void swap(readbuf_t *rbuf, readaheadbuf_t* rabuf, i32 file) { 27 | // swap buffers 28 | rabuf->_u8s = rbuf->buffers[file]; 29 | rbuf->buffers[file] = rabuf->last_buffers[file]; 30 | rabuf->last_buffers[file] = rabuf->_u8s; 31 | // swap chunk sizes 32 | rabuf->_i32 = rbuf->chunk_size[file]; 33 | rbuf->chunk_size[file] = rabuf->last_chunk_size[file]; 34 | rabuf->last_chunk_size[file] = rabuf->_i32; 35 | } 36 | 37 | inlined void read_goto_next_chunk(readbuf_t *rbuf, readaheadbuf_t* rabuf, i32 file) { 38 | swap(rbuf, rabuf, file); 39 | rbuf->offset[file] = rbuf->chunk_size[file]; 40 | rabuf->has_nexted = 1; 41 | } 42 | 43 | inlined void read_goto_last_chunk(readbuf_t *rbuf, readaheadbuf_t* rabuf, i32 file) { 44 | rbuf->offset[file] = 0; 45 | if (rabuf->has_nexted) { 46 | // goto_last only does something if goto_next has been used, and results in: buffer = last_buf + current_buf 47 | swap(rbuf, rabuf, file); 48 | REALLOC(rbuf->buffers[file], rbuf->chunk_size[file] + rabuf->last_chunk_size[file]); 49 | memcpy(rbuf->buffers[file] + rbuf->chunk_size[file], rabuf->last_buffers[file], rabuf->last_chunk_size[file]); 50 | rbuf->chunk_size[file] += rabuf->last_chunk_size[file]; 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /util/read_simple.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util.h" 4 | 5 | typedef struct readbuf_s { 6 | // public 7 | i32 bytes; 8 | u8 *buffer; 9 | // private 10 | i32 *stop; 11 | i32 *offset; 12 | FILE **files; 13 | u8 **buffers; 14 | } readbuf_t; 15 | 16 | readbuf_t rbuf_init(FILE **files, i32 num_files) { 17 | readbuf_t *buf; 18 | MALLOC(buf, sizeof(readbuf_t)); 19 | buf->files = files; 20 | MALLOC(buf->stop, sizeof(i32) * num_files); 21 | for (i32 file = 0; file < num_files; file++) 22 | buf->stop[file] = 0; 23 | MALLOC(buf->offset, sizeof(i32) * num_files); 24 | MALLOC(buf->buffers, sizeof(u8*) * num_files); 25 | for (i32 file = 0; file < num_files; file++) { 26 | buf->offset[file] = BUFFER_SIZE; 27 | MALLOC(buf->buffers[file], BUFFER_SIZE); 28 | } 29 | return *buf; 30 | } 31 | 32 | inlined void read_bytes(readbuf_t *buf, i32 size, i32 file) { 33 | ASSERT(size <= BUFFER_SIZE, "error: cant read more bytes than %d\n", BUFFER_SIZE); 34 | if (buf->stop[file] == 0) { 35 | i32 bytes_left = BUFFER_SIZE - buf->offset[file]; 36 | buf->bytes = size; 37 | if (size > bytes_left) { 38 | memmove(buf->buffers[file], buf->buffers[file] + buf->offset[file], bytes_left); 39 | i32 bytes_todo = BUFFER_SIZE - bytes_left; 40 | i32 bytes = fread_unlocked(buf->buffers[file] + bytes_left, 1, bytes_todo, buf->files[file]); 41 | buf->offset[file] = 0; 42 | if (bytes_todo != bytes) { 43 | ASSERT(!ferror_unlocked(buf->files[file]), "error: couldnt read input\n"); 44 | buf->stop[file] = bytes_left + bytes; 45 | buf->bytes = MIN(size, bytes + bytes_left); 46 | } 47 | } 48 | } else 49 | buf->bytes = MIN(size, buf->stop[file] - buf->offset[file]); 50 | buf->buffer = buf->buffers[file] + buf->offset[file]; 51 | buf->offset[file] += buf->bytes; 52 | } 53 | 
}
-------------------------------------------------------------------------------- /util/row.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util.h" 4 | 5 | typedef struct row_s { 6 | i32 stop; 7 | i32 max; 8 | i32 sizes[MAX_COLUMNS]; 9 | u8 *columns[MAX_COLUMNS]; 10 | } row_t; 11 | 12 | typedef struct raw_row_s { 13 | u16 meta; 14 | u8 *header; 15 | i32 header_size; 16 | u8 *buffer; 17 | i32 buffer_size; 18 | } raw_row_t; 19 | 20 | inlined void row_to_raw(row_t *row, raw_row_t *raw_row) { 21 | raw_row->header_size = sizeof(u16) + (row->max + 1) * sizeof(u16); 22 | raw_row->header = row->columns[0] - raw_row->header_size; 23 | raw_row->buffer = row->columns[0]; 24 | raw_row->buffer_size = 0; 25 | for (i32 i = 0; i <= row->max; i++) 26 | raw_row->buffer_size += row->sizes[i] + 1; 27 | } 28 | 29 | inlined void row_to_raw_malloc(row_t *row, raw_row_t *raw_row) { 30 | raw_row->header_size = sizeof(u16) + (row->max + 1) * sizeof(u16); 31 | MALLOC(raw_row->header, raw_row->header_size); 32 | memcpy(raw_row->header, row->columns[0] - raw_row->header_size, raw_row->header_size); 33 | raw_row->buffer_size = 0; 34 | for (i32 i = 0; i <= row->max; i++) 35 | raw_row->buffer_size += row->sizes[i] + 1; 36 | MALLOC(raw_row->buffer, raw_row->buffer_size); 37 | memcpy(raw_row->buffer, row->columns[0], raw_row->buffer_size); 38 | } 39 | 40 | inlined void raw_row_free(raw_row_t *raw_row) { 41 | free(raw_row->header); 42 | free(raw_row->buffer); 43 | } 44 | -------------------------------------------------------------------------------- /util/write.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util.h" 4 | #include "lz4.h" 5 | 6 | typedef struct writebuf_s { 7 | // private 8 | FILE **files; 9 | u8 **buffer; 10 | i32 *offset; 11 | bool lz4; 12 | u8 *lz4_buf; 13 | i32 lz4_size; 14 | } writebuf_t; 15 | 16 | writebuf_t wbuf_init(FILE **files, i32 num_files, bool lz4) { 17 | writebuf_t *buf; 18 | MALLOC(buf, sizeof(writebuf_t)); 19 | buf->files = files; 20 | MALLOC(buf->buffer, sizeof(u8*) * num_files); 21 | MALLOC(buf->offset, sizeof(i32) * num_files); 22 | for (i32 i = 0; i < num_files; i++) { 23 | buf->offset[i] = 0; 24 | MALLOC(buf->buffer[i], BUFFER_SIZE); 25 | } 26 | buf->lz4 = lz4; 27 | if (lz4) 28 | MALLOC(buf->lz4_buf, BUFFER_SIZE_LZ4); 29 | return *buf; 30 | } 31 | 32 | inlined void write_bytes(writebuf_t *buf, u8 *bytes, i32 size, i32 file) { 33 | memcpy(buf->buffer[file] + buf->offset[file], bytes, size); 34 | buf->offset[file] += size; 35 | } 36 | 37 | #define COMPRESS(buf) \ 38 | LZ4_compress_fast(buf->buffer[file], buf->lz4_buf, buf->offset[file], BUFFER_SIZE_LZ4, LZ4_ACCELERATION) 39 | 40 | inlined void write_flush(writebuf_t *buf, i32 file) { 41 | if (buf->offset[file]) { // ------------------------------------------------ flush with an empty buffer is a nop 42 | FWRITE(&buf->offset[file], sizeof(i32), buf->files[file]); // ---------- write chunk size 43 | if (buf->lz4) { 44 | i32 lz4_size = COMPRESS(buf); // ----------------------------------- compress chunk 45 | FWRITE(&lz4_size, sizeof(i32), buf->files[file]); // ------ write compressed size 46 | FWRITE(buf->lz4_buf, lz4_size, buf->files[file]); // ------ write compressed chunk 47 | } else 48 | FWRITE(buf->buffer[file], buf->offset[file], buf->files[file]); // - write chunk 49 | buf->offset[file] = 0; // ---------------------------------------------- reset the buffer to prepare for the next write 50 | } 51 
| } 52 | 53 | inlined void write_start(writebuf_t *buf, i32 size, i32 file) { 54 | ASSERT(size <= BUFFER_SIZE, "fatal: cant write larger than BUFFER_SIZE\n"); 55 | if (size > BUFFER_SIZE - buf->offset[file]) 56 | write_flush(buf, file); 57 | } 58 | -------------------------------------------------------------------------------- /util/write_simple.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util.h" 4 | 5 | typedef struct writebuf_s { 6 | // private 7 | FILE **files; 8 | u8 **buffer; 9 | i32 *offset; 10 | } writebuf_t; 11 | 12 | 13 | writebuf_t wbuf_init(FILE **files, i32 num_files) { 14 | writebuf_t *buf; 15 | MALLOC(buf, sizeof(writebuf_t)); 16 | buf->files = files; 17 | MALLOC(buf->buffer, sizeof(u8*) * num_files); 18 | MALLOC(buf->offset, sizeof(i32) * num_files); 19 | for (i32 i = 0; i < num_files; i++) { 20 | buf->offset[i] = 0; 21 | MALLOC(buf->buffer[i], BUFFER_SIZE); 22 | } 23 | return *buf; 24 | } 25 | 26 | inlined void write_flush(writebuf_t *buf, i32 file) { 27 | if (buf->offset[file]) { 28 | FWRITE(buf->buffer[file], buf->offset[file], buf->files[file]); 29 | buf->offset[file] = 0; 30 | } 31 | } 32 | 33 | inlined void write_bytes(writebuf_t *buf, u8 *bytes, i32 size, i32 file) { 34 | ASSERT(size <= BUFFER_SIZE, "fatal: cant write more than BUFFER_SIZE\n"); 35 | if (size > BUFFER_SIZE - buf->offset[file]) 36 | write_flush(buf, file); 37 | memcpy(buf->buffer[file] + buf->offset[file], bytes, size); 38 | buf->offset[file] += size; 39 | } 40 | -------------------------------------------------------------------------------- /vendor/heap.h: -------------------------------------------------------------------------------- 1 | // license: mit 2 | /* from: https://github.com/robin-thomas/min-heap/blob/a1a8d7137f3afdf2b5ebf93b9d4059c4d1dd96e8/minHeap.c */ 3 | 4 | #pragma once 5 | 6 | #include "util.h" 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #ifndef HEAP_COMPARE 11 | #define HEAP_COMPARE(meta, x, y) compare(meta, x, y) > 0 12 | #endif 13 | 14 | #define HEAP_LCHILD(x) 2 * x + 1 15 | #define HEAP_RCHILD(x) 2 * x + 2 16 | #define HEAP_PARENT(x) (x - 1) / 2 17 | 18 | typedef struct heap_s { 19 | u16 meta; 20 | i32 size; 21 | u8 **nodes; 22 | } heap_t; 23 | 24 | void heap_swap(u8 **n1, u8 **n2) { 25 | u8* temp = *n1; 26 | *n1 = *n2; 27 | *n2 = temp; 28 | } 29 | 30 | void heap_heapify(heap_t *h, i32 i) { 31 | i32 smallest = (HEAP_LCHILD(i) < h->size && HEAP_COMPARE(h->meta, h->nodes[HEAP_LCHILD(i)], h->nodes[i])) ? HEAP_LCHILD(i) : i; 32 | if(HEAP_RCHILD(i) < h->size && HEAP_COMPARE(h->meta, h->nodes[HEAP_RCHILD(i)], h->nodes[smallest])) 33 | smallest = HEAP_RCHILD(i); 34 | if(smallest != i) { 35 | heap_swap(&(h->nodes[i]), &(h->nodes[smallest])); 36 | heap_heapify(h, smallest); 37 | } 38 | } 39 | 40 | void heap_insert(heap_t *h, u8 *data) { 41 | if(h->size) 42 | h->nodes = realloc(h->nodes, (h->size + 1) * sizeof(u8*)); 43 | else 44 | h->nodes = malloc(sizeof(u8*)); 45 | i32 i = (h->size)++; 46 | while(i && HEAP_COMPARE(h->meta, data, h->nodes[HEAP_PARENT(i)])) { 47 | h->nodes[i] = h->nodes[HEAP_PARENT(i)]; 48 | i = HEAP_PARENT(i); 49 | } 50 | h->nodes[i] = data; 51 | } 52 | 53 | void heap_delete(heap_t *h) { 54 | if(h->size) { 55 | h->nodes[0] = h->nodes[--(h->size)]; 56 | h->nodes = realloc(h->nodes, h->size * sizeof(u8*)); 57 | heap_heapify(h, 0); 58 | } 59 | } 60 | 61 | void heapify(heap_t *h, i32 i) { 62 | i32 smallest = (HEAP_LCHILD(i) < h->size && HEAP_COMPARE(h->meta, h->nodes[HEAP_LCHILD(i)], h->nodes[i])) ? 
HEAP_LCHILD(i) : i; 63 | if(HEAP_RCHILD(i) < h->size && HEAP_COMPARE(h->meta, h->nodes[HEAP_RCHILD(i)], h->nodes[smallest])) { 64 | smallest = HEAP_RCHILD(i); 65 | } 66 | if(smallest != i) { 67 | heap_swap(&(h->nodes[i]), &(h->nodes[smallest])); 68 | heapify(h, smallest); 69 | } 70 | } 71 | 72 | void heap_truncate(heap_t *h, i32 size) { 73 | if (h->size <= size) 74 | return; 75 | heap_t h2 = {0}; 76 | h2.meta = h->meta; 77 | for (i32 i = 0; i < size; i++) { 78 | heap_insert(&h2, h->nodes[0]); 79 | heap_delete(h); 80 | } 81 | while (h->size) { 82 | free(h->nodes[0]); 83 | heap_delete(h); 84 | } 85 | h->nodes = h2.nodes; 86 | h->size = size; 87 | } 88 | 89 | void heap_free(heap_t *h) { 90 | free(h->nodes); 91 | } 92 | -------------------------------------------------------------------------------- /vendor/xxh3.h: -------------------------------------------------------------------------------- 1 | /* source: https://github.com/Cyan4973/xxHash/blob/a9054f397d7f41bc505638df3853b270eb9e7493/xxh3.h */ 2 | /* 3 | * xxHash - Extremely Fast Hash algorithm 4 | * Development source file for `xxh3` 5 | * Copyright (C) 2019-2020 Yann Collet 6 | * 7 | * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) 8 | * 9 | * Redistribution and use in source and binary forms, with or without 10 | * modification, are permitted provided that the following conditions are 11 | * met: 12 | * 13 | * * Redistributions of source code must retain the above copyright 14 | * notice, this list of conditions and the following disclaimer. 15 | * * Redistributions in binary form must reproduce the above 16 | * copyright notice, this list of conditions and the following disclaimer 17 | * in the documentation and/or other materials provided with the 18 | * distribution. 19 | * 20 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | * 32 | * You can contact the author at: 33 | * - xxHash homepage: https://www.xxhash.com 34 | * - xxHash source repository: https://github.com/Cyan4973/xxHash 35 | */ 36 | 37 | /* 38 | * Note: This file used to host the source code of XXH3_* variants. 39 | * during the development period. 40 | * The source code is now properly integrated within xxhash.h. 41 | * 42 | * xxh3.h is no longer useful, 43 | * but it is still provided for compatibility with source code 44 | * which used to include it directly. 45 | * 46 | * Programs are now highly discouraged to include xxh3.h. 47 | * Include `xxhash.h` instead, which is the officially supported interface. 48 | * 49 | * In the future, xxh3.h will start to generate warnings, then errors, 50 | * then it will be removed from source package and from include directory. 
51 | */ 52 | 53 | /* Simulate the same impact as including the old xxh3.h source file */ 54 | 55 | #define XXH_INLINE_ALL 56 | #include "xxhash.h" 57 | --------------------------------------------------------------------------------