├── .gitignore ├── Dockerfile.alpine ├── Dockerfile.debian ├── Makefile ├── bsv.py ├── experiments ├── bcut │ ├── bcut.c │ ├── bcut.go │ ├── bcut.rs │ └── readme.md ├── cut │ ├── cut.c │ ├── cut.go │ ├── cut.py │ ├── cut.rs │ └── readme.md └── readme.md ├── license.txt ├── readme.md ├── scripts ├── auto_reload.sh ├── install_archlinux.sh ├── makefile.sh ├── update_readme.py └── version.sh ├── setup.py ├── src ├── _bcopy.c ├── _bcopyraw.c ├── _copy.c ├── _csv.c ├── _gen_bsv.c ├── _gen_csv.c ├── _queue.c ├── bcat.c ├── bcombine.c ├── bcounteach.c ├── bcounteach_hash.c ├── bcountrows.c ├── bcut.c ├── bdedupe.c ├── bdedupe_hash.c ├── bdropuntil.c ├── bhead.c ├── blz4.c ├── blz4d.c ├── bmerge.c ├── bpartition.c ├── bquantile_merge.c ├── bquantile_sketch.c ├── bschema.c ├── bsort.c ├── bsplit.c ├── bsum.c ├── bsumeach.c ├── bsumeach_hash.c ├── bsv.c ├── btake.c ├── btakeuntil.c ├── btopn.c ├── bunzip.c ├── bzip.c ├── csv.c └── xxh3.c ├── test ├── _csv_test.py ├── _queue_test.py ├── bcat_test.py ├── bcombine_test.py ├── bcounteach_hash_test.py ├── bcounteach_test.py ├── bcountrows_test.py ├── bcut_test.py ├── bdedupe_hash_test.py ├── bdedupe_test.py ├── bdropuntil_i64_test.py ├── bdropuntil_test.py ├── blz4d_test.py ├── bmerge_test.py ├── bpartition_lz4_test.py ├── bpartition_test.py ├── bquantile_test.py ├── brmerge_test.py ├── brsort_f64_test.py ├── brsort_i64_test.py ├── brsort_test.py ├── brtopn_test.py ├── bschema_test.py ├── bsort_f64_test.py ├── bsort_i64_test.py ├── bsort_test.py ├── bsplit_test.py ├── bsum_test.py ├── bsumeach_f64_test.py ├── bsumeach_hash_i64_test.py ├── bsumeach_hash_test.py ├── bsumeach_i64_test.py ├── bsv_test.py ├── btake_test.py ├── btakeuntil_i64_test.py ├── btakeuntil_test.py ├── btopn_i64_test.py ├── btopn_test.py ├── bunzip_lz4_test.py ├── bunzip_test.py ├── bzip_lz4_test.py ├── bzip_test.py ├── csv_test.py ├── test_util.py └── xxh3_test.py ├── tox.ini ├── util ├── array.h ├── csv.h ├── dump.h ├── load.h ├── map.h ├── queue.h ├── read.h ├── read_ahead.h ├── read_simple.h ├── row.h ├── util.h ├── write.h └── write_simple.h └── vendor ├── argh.h ├── ddsketch.h ├── heap.h ├── lz4.c ├── lz4.h ├── sort.h ├── xxh3.h └── xxhash.h /.gitignore: -------------------------------------------------------------------------------- 1 | *.gcda 2 | Cargo.lock 3 | target/ 4 | .ccls* 5 | .tox/ 6 | .hypothesis/ 7 | __pycache__/ 8 | .cache/ 9 | /todo*/ 10 | bsv 11 | csv 12 | rcut 13 | _csv 14 | _gen_csv 15 | _read 16 | _write 17 | bucket 18 | bdedupe 19 | bin/bcut 20 | bcut_rust 21 | bcut_go 22 | bcut_c 23 | cut_rust 24 | cut_go 25 | cut_c 26 | bbucket 27 | bcounteach 28 | bsort 29 | bdisjoint 30 | bdropuntil 31 | _copy 32 | _gen_csv_c 33 | xxh3 34 | bsplit 35 | bpartition 36 | bcat 37 | btake 38 | btakeuntil 39 | sums 40 | bsum 41 | brsort 42 | bmerge 43 | brmerge 44 | dist/ 45 | build/ 46 | bcountrows 47 | bsv_plain 48 | bcopy 49 | bsv_ascii 50 | csv_ascii 51 | */psv_*/psv 52 | */psv_*/csv 53 | bschema 54 | _gen_bsv 55 | bsumeach 56 | bsumeachu64 57 | bsumu64 58 | bsumeachf64 59 | bunzip 60 | bzip 61 | bpartitionlz4 62 | bcompress 63 | bdecompress 64 | blz4 65 | blz4d 66 | bunziplz4 67 | bziplz4 68 | bcatlz4 69 | bmergelz4 70 | brmergelz4 71 | bcounteachhash 72 | bsumeachhashu64 73 | bcounteach_hash 74 | bmerge_lz4 75 | bpartition_lz4 76 | brmerge_lz4 77 | bsumeach_f64 78 | bsumeach_hash_u64 79 | bsumeach_u64 80 | bsum_u64 81 | bunzip_lz4 82 | bzip_lz4 83 | bcounteach-hash 84 | bmerge-lz4 85 | bpartition-lz4 86 | brmerge-lz4 87 | bsumeach-f64 88 | bsumeach-hash-u64 89 | 
bsumeach-u64 90 | bsum-u64 91 | bunzip-lz4 92 | bzip-lz4 93 | -copy 94 | -csv 95 | -gen-bsv 96 | -gen-csv 97 | bcat-lz4 98 | bsumeach-hash-f64 99 | bsort-f64 100 | brsort-f64 101 | bsort-u64 102 | bsort-i64 103 | bsumeach-hash-i64 104 | bsumeach-i64 105 | bsum-i64 106 | brsort-i64 107 | bcut 108 | bsumeach-hash 109 | bcombine 110 | version.h 111 | bhead 112 | btail 113 | bdedupe-hash 114 | btopn 115 | bquantile-merge 116 | bquantile-sketch 117 | _bcopy 118 | _bcopyraw 119 | _queue 120 | -------------------------------------------------------------------------------- /Dockerfile.alpine: -------------------------------------------------------------------------------- 1 | FROM alpine:edge 2 | 3 | RUN apk update && apk upgrade && apk add \ 4 | python3-dev \ 5 | py3-numpy \ 6 | coreutils \ 7 | make \ 8 | bash \ 9 | curl \ 10 | git \ 11 | rsync \ 12 | musl-dev \ 13 | gcc 14 | 15 | RUN python3 -m ensurepip 16 | 17 | RUN python3 -m pip install \ 18 | git+https://github.com/nathants/py-util \ 19 | git+https://github.com/nathants/py-shell \ 20 | git+https://github.com/nathants/py-pool \ 21 | git+https://github.com/nathants/cffi-xxh3 \ 22 | hypothesis \ 23 | pytest \ 24 | pytest-timeout \ 25 | pytest-xdist 26 | -------------------------------------------------------------------------------- /Dockerfile.debian: -------------------------------------------------------------------------------- 1 | FROM debian:testing 2 | 3 | RUN apt update && apt upgrade -y && apt install -y \ 4 | python3-pip \ 5 | python3-numpy \ 6 | make \ 7 | git \ 8 | rsync \ 9 | build-essential \ 10 | gcc 11 | 12 | RUN python3 -m pip install \ 13 | git+https://github.com/nathants/py-util \ 14 | git+https://github.com/nathants/py-shell \ 15 | git+https://github.com/nathants/py-pool \ 16 | git+https://github.com/nathants/cffi-xxh3 \ 17 | hypothesis \ 18 | pytest \ 19 | pytest-timeout \ 20 | pytest-xdist 21 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all clean test 2 | CFLAGS=${CC_EXTRA} -Wno-int-conversion -Wno-incompatible-pointer-types -Wno-discarded-qualifiers -Iutil -Ivendor -flto -O3 -march=native -mtune=native -lm 3 | ALL=clean docs _bcopy _bcopyraw _copy _csv _gen_bsv _gen_csv _queue bcat bcombine bcounteach bcounteach-hash bcountrows bcut bdedupe bdedupe-hash bdropuntil bhead blz4 blz4d bmerge bpartition bquantile-merge bquantile-sketch bschema bsort bsplit bsum bsumeach bsumeach-hash bsv btake btakeuntil btopn bunzip bzip csv xxh3 4 | 5 | all: $(ALL) 6 | 7 | setup: 8 | mkdir -p bin 9 | ./scripts/version.sh &>/dev/null 10 | 11 | clean: setup 12 | cd bin && rm -f -- * *.* 13 | 14 | docs: 15 | ./scripts/update_readme.py 16 | 17 | test: setup 18 | tox 19 | 20 | _bcopy: setup 21 | gcc vendor/lz4.c src/_bcopy.c -o bin/_bcopy $(CFLAGS) 22 | 23 | _bcopyraw: setup 24 | gcc vendor/lz4.c src/_bcopyraw.c -o bin/_bcopyraw $(CFLAGS) 25 | 26 | _copy: setup 27 | gcc vendor/lz4.c src/_copy.c -o bin/_copy $(CFLAGS) 28 | 29 | _csv: setup 30 | gcc vendor/lz4.c src/_csv.c -o bin/_csv $(CFLAGS) 31 | 32 | _gen_bsv: setup 33 | gcc vendor/lz4.c src/_gen_bsv.c -o bin/_gen_bsv $(CFLAGS) 34 | 35 | _gen_csv: setup 36 | gcc vendor/lz4.c src/_gen_csv.c -o bin/_gen_csv $(CFLAGS) 37 | 38 | _queue: setup 39 | gcc vendor/lz4.c src/_queue.c -o bin/_queue $(CFLAGS) 40 | 41 | bcat: setup 42 | gcc vendor/lz4.c src/bcat.c -o bin/bcat $(CFLAGS) 43 | 44 | bcombine: setup 45 | gcc vendor/lz4.c src/bcombine.c -o bin/bcombine 
$(CFLAGS) 46 | 47 | bcounteach: setup 48 | gcc vendor/lz4.c src/bcounteach.c -o bin/bcounteach $(CFLAGS) 49 | 50 | bcounteach-hash: setup 51 | gcc vendor/lz4.c src/bcounteach_hash.c -o bin/bcounteach-hash $(CFLAGS) 52 | 53 | bcountrows: setup 54 | gcc vendor/lz4.c src/bcountrows.c -o bin/bcountrows $(CFLAGS) 55 | 56 | bcut: setup 57 | gcc vendor/lz4.c src/bcut.c -o bin/bcut $(CFLAGS) 58 | 59 | bdedupe: setup 60 | gcc vendor/lz4.c src/bdedupe.c -o bin/bdedupe $(CFLAGS) 61 | 62 | bdedupe-hash: setup 63 | gcc vendor/lz4.c src/bdedupe_hash.c -o bin/bdedupe-hash $(CFLAGS) 64 | 65 | bdropuntil: setup 66 | gcc vendor/lz4.c src/bdropuntil.c -o bin/bdropuntil $(CFLAGS) 67 | 68 | bhead: setup 69 | gcc vendor/lz4.c src/bhead.c -o bin/bhead $(CFLAGS) 70 | 71 | blz4: setup 72 | gcc vendor/lz4.c src/blz4.c -o bin/blz4 $(CFLAGS) 73 | 74 | blz4d: setup 75 | gcc vendor/lz4.c src/blz4d.c -o bin/blz4d $(CFLAGS) 76 | 77 | bmerge: setup 78 | gcc vendor/lz4.c src/bmerge.c -o bin/bmerge $(CFLAGS) 79 | 80 | bpartition: setup 81 | gcc vendor/lz4.c src/bpartition.c -o bin/bpartition $(CFLAGS) 82 | 83 | bquantile-merge: setup 84 | gcc vendor/lz4.c src/bquantile_merge.c -o bin/bquantile-merge $(CFLAGS) 85 | 86 | bquantile-sketch: setup 87 | gcc vendor/lz4.c src/bquantile_sketch.c -o bin/bquantile-sketch $(CFLAGS) 88 | 89 | bschema: setup 90 | gcc vendor/lz4.c src/bschema.c -o bin/bschema $(CFLAGS) 91 | 92 | bsort: setup 93 | gcc vendor/lz4.c src/bsort.c -o bin/bsort $(CFLAGS) 94 | 95 | bsplit: setup 96 | gcc vendor/lz4.c src/bsplit.c -o bin/bsplit $(CFLAGS) 97 | 98 | bsum: setup 99 | gcc vendor/lz4.c src/bsum.c -o bin/bsum $(CFLAGS) 100 | 101 | bsumeach: setup 102 | gcc vendor/lz4.c src/bsumeach.c -o bin/bsumeach $(CFLAGS) 103 | 104 | bsumeach-hash: setup 105 | gcc vendor/lz4.c src/bsumeach_hash.c -o bin/bsumeach-hash $(CFLAGS) 106 | 107 | bsv: setup 108 | gcc vendor/lz4.c src/bsv.c -o bin/bsv $(CFLAGS) 109 | 110 | btake: setup 111 | gcc vendor/lz4.c src/btake.c -o bin/btake $(CFLAGS) 112 | 113 | btakeuntil: setup 114 | gcc vendor/lz4.c src/btakeuntil.c -o bin/btakeuntil $(CFLAGS) 115 | 116 | btopn: setup 117 | gcc vendor/lz4.c src/btopn.c -o bin/btopn $(CFLAGS) 118 | 119 | bunzip: setup 120 | gcc vendor/lz4.c src/bunzip.c -o bin/bunzip $(CFLAGS) 121 | 122 | bzip: setup 123 | gcc vendor/lz4.c src/bzip.c -o bin/bzip $(CFLAGS) 124 | 125 | csv: setup 126 | gcc vendor/lz4.c src/csv.c -o bin/csv $(CFLAGS) 127 | 128 | xxh3: setup 129 | gcc vendor/lz4.c src/xxh3.c -o bin/xxh3 $(CFLAGS) 130 | 131 | -------------------------------------------------------------------------------- /bsv.py: -------------------------------------------------------------------------------- 1 | from typing import Generator, Sequence, IO 2 | import struct 3 | import io 4 | 5 | u16 = 'H' 6 | i32 = 'i' 7 | sizeof_i32 = 4 8 | sizeof_u16 = 2 9 | buffer_size = 1024 * 1024 * 5 10 | 11 | def load(f: IO[bytes]) -> Generator[Sequence[bytes], None, None]: 12 | # read chunk header to get size of chunk 13 | while True: 14 | data = f.read(sizeof_i32) 15 | if len(data) == 0: 16 | break 17 | elif len(data) == sizeof_i32: 18 | # read chunk 19 | chunk_size = struct.unpack(i32, data)[0] 20 | buffer = f.read(chunk_size) 21 | assert len(buffer) == chunk_size, [len(buffer), chunk_size] 22 | # buffer = io.BytesIO(buffer) 23 | offset = 0 24 | while True: 25 | # maybe read max index 26 | data = buffer[offset:offset + sizeof_u16] 27 | offset += sizeof_u16 28 | assert len(data) in {0, sizeof_u16} 29 | if len(data) != sizeof_u16: 30 | break 31 | max = struct.unpack(u16, 
data)[0] 32 | # read sizes 33 | size = (max + 1) * sizeof_u16 34 | data = buffer[offset:offset + size] 35 | offset += size 36 | assert len(data) == size 37 | sizes = [struct.unpack(u16, data[i * sizeof_u16:i * sizeof_u16 + sizeof_u16])[0] for i in range(size // sizeof_u16)] 38 | # read value bytes 39 | vals = [] 40 | for size in sizes: 41 | data = buffer[offset:offset + size] 42 | offset += size 43 | assert len(data) == size 44 | assert buffer[offset:offset + 1] == b'\0' 45 | offset += 1 46 | vals.append(data) 47 | yield vals 48 | else: 49 | assert False 50 | 51 | def dump(f: IO[bytes], xss: Sequence[Sequence[bytes]]) -> None: 52 | buffer = io.BytesIO() 53 | for xs in xss: 54 | # write max index 55 | assert sizeof_u16 == buffer.write(struct.pack(u16, len(xs) - 1)) 56 | # write sizes 57 | for x in xs: 58 | assert sizeof_u16 == buffer.write(struct.pack(u16, len(x))) 59 | # write vals 60 | for x in xs: 61 | assert len(x) == buffer.write(x) 62 | assert 1 == buffer.write(b'\0') 63 | # check the chunk size before writing the header, so a failed assert doesn't leave a dangling header on the stream 64 | assert len(buffer.getvalue()) < buffer_size, f"you can't dump more than {buffer_size} bytes at a time" 65 | assert sizeof_i32 == f.write(struct.pack(i32, len(buffer.getvalue()))) 66 | assert len(buffer.getvalue()) == f.write(buffer.getvalue()) 67 | -------------------------------------------------------------------------------- /experiments/bcut/bcut.c: -------------------------------------------------------------------------------- 1 | #include "load.h" 2 | #include "write_simple.h" 3 | 4 | int main(int argc, const char **argv) { 5 | 6 | // setup bsv 7 | SIGPIPE_HANDLER(); 8 | INVARIANTS(); 9 | INCREASE_PIPE_SIZES(); 10 | 11 | // setup input 12 | FILE *in_files[1] = {stdin}; 13 | readbuf_t rbuf; 14 | rbuf_init(&rbuf, in_files, 1, false); 15 | 16 | // setup output 17 | FILE *out_files[1] = {stdout}; 18 | writebuf_t wbuf; 19 | wbuf_init(&wbuf, out_files, 1); 20 | 21 | // setup state 22 | row_t row; 23 | char *f; 24 | char *fs; 25 | i32 field; 26 | i32 num_fields=0; 27 | i32 field_nums[MAX_COLUMNS]; 28 | 29 | // parse args 30 | fs = (char*)argv[1]; 31 | while ((f = strsep(&fs, ","))) { 32 | field = atoi(f); 33 | field_nums[num_fields++] = field - 1; 34 | ASSERT(field <= MAX_COLUMNS, "fatal: cannot select fields above %d, tried to select: %d\n", MAX_COLUMNS, field); 35 | ASSERT(field >= 1, "fatal: fields must be positive, got: %d", field); 36 | ASSERT(num_fields <= MAX_COLUMNS, "fatal: cannot select more than %d fields\n", MAX_COLUMNS); 37 | } 38 | 39 | // process input row by row 40 | while (1) { 41 | load_next(&rbuf, &row, 0); 42 | if (row.stop) 43 | break; 44 | 45 | i32 j = 0; 46 | for (i32 i = 0; i < num_fields; i++) { 47 | field = field_nums[i]; 48 | write_bytes(&wbuf, row.columns[field], row.sizes[field], 0); 49 | if (++j < num_fields) { 50 | write_bytes(&wbuf, ",", 1, 0); 51 | } 52 | } 53 | write_bytes(&wbuf, "\n", 1, 0); 54 | 55 | } 56 | write_flush(&wbuf, 0); 57 | } 58 | -------------------------------------------------------------------------------- /experiments/bcut/bcut.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "encoding/binary" 6 | "io" 7 | "os" 8 | "strconv" 9 | "strings" 10 | ) 11 | 12 | func main() { 13 | var fields []int 14 | for _, x := range strings.Split(os.Args[1], ",") { 15 | x, err := strconv.Atoi(x) 16 | if err != nil { 17 | panic(err) 18 | } 19 | fields = append(fields, x-1) 20 | } 21 | max := int32(0) 22 | sizes := make([]int32, 1<<16) 23 | offsets := make([]int32, 1<<16) 24 | r := 
bufio.NewReader(os.Stdin) 25 | w := bufio.NewWriter(os.Stdout) 26 | defer w.Flush() 27 | // buffer4 := make([]byte, 4) 28 | chunk_offset := int32(0) 29 | chunk_size := int32(0) 30 | chunk_buffer := make([]byte, 1024*1024*5) 31 | for { 32 | 33 | // read chunk size 34 | err := binary.Read(r, binary.LittleEndian, &chunk_size) 35 | if err != nil { 36 | break 37 | } 38 | 39 | // read chunk 40 | _, err = io.ReadFull(r, chunk_buffer[:chunk_size]) 41 | if err != nil { 42 | panic(err) 43 | } 44 | 45 | // read all rows in chunk 46 | chunk_offset = 0 47 | for chunk_offset < chunk_size { 48 | // read row max 49 | max = int32(binary.LittleEndian.Uint16(chunk_buffer[chunk_offset:])) 50 | chunk_offset += 2 51 | 52 | // read row sizes 53 | for i := int32(0); i <= max; i++ { 54 | sizes[i] = int32(binary.LittleEndian.Uint16(chunk_buffer[chunk_offset:])) 55 | chunk_offset += 2 56 | } 57 | 58 | // setup row offsets 59 | for i := int32(0); i <= max; i++ { 60 | offsets[i] = chunk_offset 61 | chunk_offset += sizes[i] + 1 62 | } 63 | 64 | // handle row 65 | for i, f := range fields { 66 | _, err = w.Write(chunk_buffer[offsets[f] : offsets[f]+sizes[f]]) 67 | if err != nil { 68 | panic(err) 69 | } 70 | if i != len(fields)-1 { 71 | _, err = w.Write([]byte(",")) 72 | if err != nil { 73 | panic(err) 74 | } 75 | } 76 | } 77 | _, err = w.Write([]byte("\n")) 78 | if err != nil { 79 | panic(err) 80 | } 81 | } 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /experiments/bcut/bcut.rs: -------------------------------------------------------------------------------- 1 | use std::io::{stdin, stdout, BufReader, BufWriter, Write, Read}; 2 | use std::env::args; 3 | 4 | const MAX_COLUMNS: usize = 1 << 16; 5 | 6 | #[inline] 7 | fn read_i32(buf: &[u8]) -> usize { 8 | let mut out: usize = 0; 9 | let ptr_out = &mut out as *mut usize as *mut u8; 10 | unsafe { 11 | std::ptr::copy_nonoverlapping(buf.as_ptr(), ptr_out, 4); 12 | } 13 | out 14 | } 15 | 16 | 17 | #[inline] 18 | fn read_u16(buf: &[u8]) -> usize { 19 | let mut out: usize = 0; 20 | let ptr_out = &mut out as *mut usize as *mut u8; 21 | unsafe { 22 | std::ptr::copy_nonoverlapping(buf.as_ptr(), ptr_out, 2); 23 | } 24 | out 25 | } 26 | 27 | fn main() { 28 | 29 | // parse args 30 | let fields: Vec = args().collect(); 31 | assert!(fields.len() == 2, "usage: bcut field1,field2,fieldN,..."); 32 | let fields: Vec = fields[1] 33 | .split(",") 34 | .map(|x| x.parse::().unwrap()) 35 | .map(|x| { assert!(x > 0 && x < MAX_COLUMNS as i32); (x - 1) as usize}) 36 | .collect(); 37 | 38 | // setup io 39 | let mut reader = BufReader::with_capacity(1024 * 512, stdin()); 40 | let mut writer = BufWriter::with_capacity(1024 * 512, stdout()); 41 | let mut buffer4: [u8; 4] = [0; 4]; 42 | let mut chunk_offset: usize; 43 | let mut chunk_buffer: [u8; 1024*1024*5] = [0; 1024*1024*5]; 44 | 45 | // setup state 46 | let mut max: usize; 47 | let mut sizes: [usize; MAX_COLUMNS] = [0; MAX_COLUMNS]; 48 | let mut offsets: [usize; MAX_COLUMNS] = [0; MAX_COLUMNS]; 49 | 50 | // process input line by line 51 | while let Ok(_) = reader.read_exact(&mut buffer4) { 52 | // read chunk size 53 | let chunk_size = read_i32(&buffer4); 54 | 55 | // read next chunk 56 | let mut chunk_buffer = &mut chunk_buffer[..chunk_size]; 57 | reader.read_exact(&mut chunk_buffer).unwrap(); 58 | 59 | // read all rows in chunk 60 | chunk_offset = 0; 61 | while chunk_offset < chunk_size { 62 | 63 | // read row max 64 | max = read_u16(&chunk_buffer[chunk_offset..]); 65 | chunk_offset += 2; 
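// row layout inside a chunk, as defined by bsv.py: a u16 max column index, then (max + 1) u16 column sizes, then the value bytes for each column, each value followed by a single \0 terminator (hence the + 1 when advancing past each value below)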
66 | 67 | // read row sizes 68 | for i in 0..max+1 { 69 | sizes[i] = read_u16(&chunk_buffer[chunk_offset..]); 70 | chunk_offset += 2; 71 | } 72 | 73 | // setup row offsets 74 | for i in 0..max+1 { 75 | offsets[i] = chunk_offset; 76 | chunk_offset += sizes[i] + 1; 77 | } 78 | 79 | // handle row 80 | let mut i = 0; 81 | for field in &fields { 82 | assert!(*field <= max, "found a row without enough columns"); 83 | let offset = offsets[*field]; 84 | let size = sizes[*field]; 85 | writer.write_all(&chunk_buffer[offset..offset+size]).unwrap(); 86 | i += 1; 87 | if i < fields.len() { 88 | writer.write_all(&[b',']).unwrap(); 89 | } 90 | } 91 | writer.write_all(&[b'\n']).unwrap(); 92 | 93 | } 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /experiments/bcut/readme.md: -------------------------------------------------------------------------------- 1 | ### experiments with alternate implementations of bcut 2 | 3 | ##### ramfs 4 | ```bash 5 | cd /tmp 6 | ``` 7 | 8 | ##### build bsv and put bin on PATH 9 | ```bash 10 | >> (cd ~/repos/bsv && make) 11 | >> export PATH=$PATH:~/repos/bsv/bin 12 | ``` 13 | 14 | ##### increase max pipe size to 5MB 15 | ```bash 16 | >> sudo sysctl fs.pipe-max-size=5242880 17 | ``` 18 | 19 | ##### make sure we are dealing with bytes only 20 | ```bash 21 | >> export LC_ALL=C 22 | ``` 23 | 24 | ##### make some csv 25 | ```bash 26 | >> time _gen_csv 8 25000000 >data.csv 27 | real 0m7.360s 28 | user 0m6.677s 29 | sys 0m0.680s 30 | ``` 31 | 32 | ##### convert it to bsv 33 | ```bash 34 | >> bsv <data.csv >data.bsv 35 | >> time bsv <data.csv >/dev/null 36 | real 0m5.115s 37 | user 0m4.893s 38 | sys 0m0.220s 39 | ``` 40 | 41 | ##### see how well the data compresses 42 | ```bash 43 | >> time lz4 <data.csv >data.csv.lz4 44 | real 0m5.135s 45 | user 0m4.782s 46 | sys 0m0.349s 47 | 48 | >> time lz4 <data.bsv >data.bsv.lz4 49 | real 0m6.876s 50 | user 0m6.374s 51 | sys 0m0.500s 52 | ``` 53 | 54 | ##### check the sizes, bsv trades space for time 55 | ```bash 56 | >> ls -lh data.* | cut -d' ' -f5,9 57 | 2.2G data.bsv 58 | 1.1G data.bsv.lz4 59 | 1.8G data.csv 60 | 779M data.csv.lz4 61 | ``` 62 | 63 | ##### copy the experiments and make sure they all get the same result 64 | ```bash 65 | >> cp ~/repos/bsv/experiments/bcut/* . 66 | >> cp -r ~/repos/bsv/util . 67 | >> cp -r ~/repos/bsv/vendor . 
68 | 69 | >> cut -d, -f3,7 > go build -o bcut_go bcut.go 73 | >> ./bcut_go 3,7 > rustc -O -o bcut_rust bcut.rs 77 | >> ./bcut_rust 3,7 > gcc -Ivendor -Iutil -O3 -flto -march=native -mtune=native -o bcut_c bcut.c 81 | >> ./bcut_c 3,7 > bcut 3,7 > time cut -d, -f3,7 <data.csv >/dev/null 91 | real 0m5.784s 92 | user 0m5.472s 93 | sys 0m0.311s 94 | ``` 95 | 96 | ##### go is faster 97 | ```bash 98 | >> time ./bcut_go 3,7 <data.bsv >/dev/null 99 | real 0m2.179s 100 | user 0m1.870s 101 | sys 0m0.312s 102 | ``` 103 | 104 | ##### rust is faster 105 | ```bash 106 | >> time ./bcut_rust 3,7 <data.bsv >/dev/null 107 | real 0m1.343s 108 | user 0m1.139s 109 | sys 0m0.203s 110 | ``` 111 | 112 | ##### c is faster 113 | ```bash 114 | >> time ./bcut_c 3,7 <data.bsv >/dev/null 115 | real 0m0.812s 116 | user 0m0.622s 117 | sys 0m0.189s 118 | ``` 119 | -------------------------------------------------------------------------------- /experiments/cut/cut.c: -------------------------------------------------------------------------------- 1 | #include "csv.h" 2 | #include "write_simple.h" 3 | 4 | int main(int argc, char **argv) { 5 | // setup io 6 | CSV_INIT(); 7 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1); 8 | // setup state 9 | char *f; 10 | char *fs; 11 | int field; 12 | int num_fields=0; 13 | int field_nums[MAX_COLUMNS]; 14 | // parse args 15 | fs = (char*)argv[1]; 16 | while ((f = strsep(&fs, ","))) { 17 | field = atoi(f); 18 | field_nums[num_fields++] = field - 1; 19 | ASSERT(field <= MAX_COLUMNS, "fatal: cannot select fields above %d, tried to select: %d\n", MAX_COLUMNS, field); 20 | ASSERT(field >= 1, "fatal: fields must be positive, got: %d", field); 21 | ASSERT(num_fields <= MAX_COLUMNS, "fatal: cannot select more than %d fields\n", MAX_COLUMNS); 22 | } 23 | // process input row by row 24 | while (1) { 25 | CSV_READ_LINE(stdin); 26 | if (csv_stop) 27 | break; 28 | if (csv_max || csv_sizes[0]) { 29 | int j = 0; 30 | for (int i = 0; i < num_fields; i++) { 31 | ASSERT(field_nums[i] <= csv_max, "fatal: not enough columns\n"); 32 | field = field_nums[i]; 33 | write_bytes(&wbuf, csv_columns[field], csv_sizes[field], 0); 34 | if (++j < num_fields) 35 | write_bytes(&wbuf, ",", 1, 0); 36 | } 37 | write_bytes(&wbuf, "\n", 1, 0); 38 | } 39 | } 40 | write_flush(&wbuf, 0); 41 | } 42 | -------------------------------------------------------------------------------- /experiments/cut/cut.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "io" 6 | "os" 7 | "strconv" 8 | "strings" 9 | ) 10 | 11 | func main() { 12 | var fields []int 13 | for _, x := range strings.Split(os.Args[1], ",") { 14 | x, err := strconv.Atoi(x) 15 | if err != nil { 16 | panic(err) 17 | } 18 | fields = append(fields, x-1) 19 | } 20 | starts := make([]int, 1<<16) 21 | ends := make([]int, 1<<16) 22 | r := bufio.NewReader(os.Stdin) 23 | w := bufio.NewWriter(os.Stdout) 24 | defer w.Flush() 25 | for { 26 | // read row 27 | line, isPrefix, err := r.ReadLine() 28 | if isPrefix { 29 | panic("line too long") 30 | } 31 | if err != nil { 32 | if err == io.EOF || err == io.ErrUnexpectedEOF { 33 | break 34 | } 35 | panic(err) 36 | } 37 | // parse row 38 | offset := 0 39 | max := 0 40 | for i := 0; i < len(line); i++ { 41 | switch line[i] { 42 | case byte(','): 43 | starts[max] = offset 44 | ends[max] = i 45 | offset = i + 1 46 | max += 1 47 | } 48 | } 49 | starts[max] = offset 50 | ends[max] = len(line) 51 | // handle row 52 | for i, f := range fields { 53 | _, err = w.Write(line[starts[f]:ends[f]]) 54 | if err != 
nil { 55 | panic(err) 56 | } 57 | if i != len(fields)-1 { 58 | _, err = w.Write([]byte(",")) 59 | if err != nil { 60 | panic(err) 61 | } 62 | } 63 | } 64 | _, err = w.Write([]byte("\n")) 65 | if err != nil { 66 | panic(err) 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /experiments/cut/cut.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | fields = [int(x) - 1 for x in sys.argv[1].split(',')] 4 | 5 | buffer_size = 1024 * 512 6 | 7 | # row metadata 8 | starts = [0 for _ in range(1 << 16)] # type: ignore 9 | ends = [0 for _ in range(1 << 16)] # type: ignore 10 | 11 | # delimiters 12 | comma = bytearray(b',')[0] 13 | newline = bytearray(b'\n')[0] 14 | 15 | # write read_buffer 16 | write_buffer = bytearray(buffer_size) 17 | 18 | while True: 19 | # read read_buffer size 20 | read_buffer = sys.stdin.buffer.read(buffer_size) # type: ignore 21 | stop = len(read_buffer) != buffer_size 22 | # on a full read, extend with the next full line so the read_buffer always ends with a newline 23 | if len(read_buffer) == buffer_size: 24 | read_buffer += sys.stdin.buffer.readline() 25 | read_offset = 0 26 | write_offset = 0 27 | max = 0 28 | # process read_buffer byte by byte 29 | for i in range(len(read_buffer)): 30 | # found the next column 31 | if read_buffer[i] == comma: 32 | starts[max] = read_offset 33 | ends[max] = i 34 | read_offset = i + 1 35 | max += 1 36 | # found the row end 37 | elif read_buffer[i] == newline: 38 | starts[max] = read_offset 39 | ends[max] = i 40 | read_offset = i + 1 # next row starts on the byte following the newline 41 | # handle row 42 | val = b'' 43 | for i, f in enumerate(fields): 44 | val += read_buffer[starts[f]:ends[f]] 45 | if i != len(fields) - 1: 46 | val += b',' 47 | val += b'\n' 48 | # maybe flush and write 49 | if len(val) > len(write_buffer) - write_offset: 50 | sys.stdout.buffer.write(write_buffer[:write_offset]) 51 | write_offset = 0 52 | write_buffer[write_offset:write_offset + len(val)] = val 53 | write_offset += len(val) 54 | # reset for next row 55 | max = 0 56 | # flush 57 | sys.stdout.buffer.write(write_buffer[:write_offset]) 58 | if stop: 59 | break 60 | -------------------------------------------------------------------------------- /experiments/cut/cut.rs: -------------------------------------------------------------------------------- 1 | use std::io::{stdin, stdout, BufReader, BufWriter, Write, BufRead}; 2 | use std::env::args; 3 | 4 | const MAX_COLUMNS: usize = 1 << 16; 5 | 6 | fn main() { 7 | // parse args 8 | let fields: Vec = args().collect(); 9 | assert!(fields.len() == 2, "usage: bcut field1,field2,fieldN,..."); 10 | let fields: Vec = fields[1] 11 | .split(",") 12 | .map(|x| x.parse::().unwrap()) 13 | .map(|x| { assert!(x > 0 && x < MAX_COLUMNS as i32); (x - 1) as usize}) 14 | .collect(); 15 | // setup io 16 | let mut reader = BufReader::with_capacity(1024 * 512, stdin()); 17 | let mut writer = BufWriter::with_capacity(1024 * 512, stdout()); 18 | let mut buffer: Vec = Vec::new(); 19 | // setup state 20 | let mut offsets: [usize; MAX_COLUMNS] = [0; MAX_COLUMNS]; 21 | let mut lens: [usize; MAX_COLUMNS] = [0; MAX_COLUMNS]; 22 | // process input line by line 23 | loop { 24 | // read the next line into the buffer 25 | buffer.clear(); 26 | match reader.read_until(b'\n', &mut buffer) { 27 | Err(err) => std::panic!(err), 28 | Ok(0) => break, 29 | // process the current line 30 | Ok(mut n) => { 31 | if buffer[n - 1] == b'\n' { 32 | n -= 1; 
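// n is now the line length without the trailing newline, so a blank line leaves n == 0 and is skipped by the check below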
33 | } 34 | if n > 0 { 35 | // discover the fields of this row 36 | let mut max = 0; 37 | let mut offset = 0; 38 | for (i, part) in buffer[..n].split(|val| val == &b',').enumerate() { 39 | offsets[i] = offset; 40 | lens[i] = part.len(); 41 | offset += part.len() + 1; 42 | max = i; 43 | } 44 | // output the chosen fields 45 | let mut i = 0; 46 | for field in &fields { 47 | assert!(*field <= max, "found a row without enough columns"); 48 | let offset = offsets[*field]; 49 | let len = lens[*field]; 50 | writer.write_all(&buffer[..n][offset..offset+len]).unwrap(); 51 | i += 1; 52 | if i < fields.len() { 53 | writer.write_all(&[b',']).unwrap(); 54 | } 55 | } 56 | writer.write_all(&[b'\n']).unwrap(); 57 | } 58 | } 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /experiments/readme.md: -------------------------------------------------------------------------------- 1 | ### tldr; 2 | 3 | ```bash 4 | >> export LC_ALL=C 5 | >> time cut -d, -f3,7 <data.csv >/dev/null 6 | real 0m5.784s 7 | user 0m5.472s 8 | sys 0m0.311s 9 | ``` 10 | 11 | ```bash 12 | >> time bcut 3,7 <data.bsv >/dev/null 13 | real 0m1.010s 14 | user 0m0.729s 15 | sys 0m0.280s 16 | ``` 17 | 18 | ```bash 19 | >> time sort --parallel=1 -S50% -k1,1 <data.csv >/dev/null 20 | real 0m22.406s 21 | user 0m21.516s 22 | sys 0m0.880s 23 | ``` 24 | 25 | ```bash 26 | >> time bsort <data.bsv >/dev/null 27 | real 0m13.558s 28 | user 0m12.266s 29 | sys 0m1.139s 30 | ``` 31 | 32 | ```bash 33 | >> time sort -m -k1,1 -S50% csv.*.sorted >/dev/null 34 | real 0m8.846s 35 | user 0m6.692s 36 | sys 0m2.149s 37 | ``` 38 | 39 | ```bash 40 | >> time bmerge $(cat filenames.txt | while read path; do echo $path.sorted; done) >/dev/null 41 | real 0m1.361s 42 | user 0m0.911s 43 | sys 0m0.450s 44 | ``` 45 | 46 | ### alternate implementations and performance experiments 47 | 48 | [cut](https://github.com/nathants/bsv/blob/master/experiments/cut/) in c, rust, go, and pypy 49 | 50 | [bcut](https://github.com/nathants/bsv/blob/master/experiments/bcut/) in c, rust, go and pypy 51 | 52 | [sort and merge](https://github.com/nathants/bsv/blob/master/experiments/cut/readme.md#the-only-random-access-that-should-ever-be-happening-is-sort) with bsv and coreutils 53 | 54 | [linear scan](https://github.com/nathants/bsv/blob/master/experiments/cut/readme.md#if-you-have-sorted-data-you-can-drop-rows-before-a-given-value-efficiently) with bsv and grep 55 | 
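The merge timing above presumes each shard listed in filenames.txt was already sorted; a minimal sketch of the full sort-then-merge pattern (the merged.bsv output name is illustrative):

```bash
>> while read path; do bsort <$path >$path.sorted; done <filenames.txt
>> bmerge $(cat filenames.txt | while read path; do echo $path.sorted; done) >merged.bsv
```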
-------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2018-present Nathan Todd-Stone 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /scripts/auto_reload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eou pipefail 3 | 4 | name=$1 5 | 6 | if ! which aws-ec2-new &>/dev/null; then 7 | echo fatal: need to install https://github.com/nathants/cli-aws 8 | exit 1 9 | fi 10 | 11 | cd $(dirname $(dirname $0)) 12 | 13 | # push code 14 | aws-ec2-rsync . :bsv/ $name -y 15 | 16 | # reinstall bsv 17 | aws-ec2-ssh $name -yc " 18 | cd ~/bsv 19 | make -j && sudo mv -fv bin/* /usr/local/bin 20 | " 21 | 22 | # kill any running reloaders 23 | aws-ec2-ssh $name -yc "(ps -ef | grep entr | grep make | grep -v grep | awk '{print \$2}' | xargs kill) || true" 24 | 25 | # setup the remote reloader 26 | aws-ec2-ssh $name --no-tty -yc " 27 | cd ~/bsv 28 | ((find -type f -name '*.c' -o -name '*.h' | entr -r bash -c 'sudo rm -f /usr/local/bin/b* && make -j && sudo mv -fv bin/* /usr/local/bin') &> ~/bsv.log &) 29 | " 30 | -------------------------------------------------------------------------------- /scripts/makefile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | cd $(dirname $(dirname $0)) 5 | echo ".PHONY: all clean test" > Makefile 6 | echo "CFLAGS=\${CC_EXTRA} -Wno-int-conversion -Wno-incompatible-pointer-types -Wno-discarded-qualifiers -Iutil -Ivendor -flto -O3 -march=native -mtune=native -lm" >> Makefile 7 | echo ALL=clean docs $(for src in src/*.c; do 8 | if basename $src | grep ^_ &>/dev/null; then 9 | basename $src | cut -d. -f1 10 | else 11 | basename $src | cut -d. -f1 | tr '_' '-' 12 | fi 13 | done) >> Makefile 14 | echo >> Makefile 15 | 16 | echo "all: \$(ALL)" >> Makefile 17 | echo >> Makefile 18 | 19 | echo setup: >> Makefile 20 | echo -e '\tmkdir -p bin' >> Makefile 21 | echo -e '\t./scripts/version.sh &>/dev/null' >> Makefile 22 | echo >> Makefile 23 | 24 | echo clean: setup >> Makefile 25 | echo -e '\tcd bin && rm -f -- * *.*' >> Makefile 26 | echo >> Makefile 27 | 28 | echo docs: >> Makefile 29 | echo -e '\t./scripts/update_readme.py' >> Makefile 30 | echo >> Makefile 31 | 32 | echo test: setup >> Makefile 33 | echo -e '\ttox' >> Makefile 34 | echo >> Makefile 35 | 36 | for path in src/*.c; do 37 | if basename $path | grep ^_ &>/dev/null; then 38 | name=$(basename $path | cut -d. -f1) 39 | else 40 | name=$(basename $path | cut -d. -f1 | tr '_' '-') 41 | fi 42 | echo "$name: setup" >> Makefile 43 | echo -e "\tgcc vendor/lz4.c $path -o bin/$name \$(CFLAGS)" >> Makefile 44 | 45 | echo >> Makefile 46 | if ! 
cat .gitignore | grep ^$name$ &>/dev/null; then 47 | echo $name >> .gitignore 48 | fi 49 | done 50 | -------------------------------------------------------------------------------- /scripts/update_readme.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import subprocess 4 | 5 | os.chdir(os.path.dirname(os.path.dirname(__file__))) 6 | co = lambda *a: subprocess.check_output(' '.join(map(str, a)), shell=True, executable='/bin/bash').decode('utf-8').strip() 7 | 8 | with open('readme.md') as f: 9 | xs = f.read().splitlines() 10 | 11 | before = [] 12 | for x in xs: 13 | before.append(x) 14 | if x.startswith('## tools'): 15 | before.append('') 16 | break 17 | 18 | before.append('| name | description |') 19 | before.append('| -- | -- |') 20 | 21 | after = [] 22 | 23 | for path in co('ls src/*.c').splitlines(): 24 | name = path 25 | if not path.split('/')[-1].startswith('_'): 26 | with open(path) as f: 27 | xs = f.read().splitlines() 28 | try: 29 | assert any(x.strip() == 'SETUP();' for x in xs), path 30 | name = path.split('/')[-1].split('.c')[0] 31 | description = [x for x in xs if x.startswith('#define DESCRIPTION')][0].replace('\\n', '\n').split('"')[1] 32 | usage = [x for x in xs if x.startswith('#define USAGE')][0].replace('\\n', '\n').split('"')[1] 33 | try: 34 | example = [x for x in xs if x.startswith('#define EXAMPLE')][0].replace('\\n', '\n').split('"')[1] 35 | except IndexError: 36 | example = '' 37 | while True: 38 | x = xs.pop(0) 39 | if x.startswith('#define EXAMPLE'): 40 | break 41 | while True: 42 | x = xs.pop(0) 43 | if not x.strip('"'): 44 | break 45 | x = x.replace('\\n', '\n').split('"')[1] 46 | if x.strip(): 47 | example += x 48 | except: 49 | print(f'fatal: failed to parse docs in file: {name}.c') 50 | raise 51 | if not name.startswith('_'): 52 | name = name.replace('_', '-') 53 | before.append(f'| [{name}](#{name}) | {description.rstrip()} |'.strip()) 54 | after.append(f'\n### [{name}](https://github.com/nathants/bsv/blob/master/src/{name.replace("-", "_")}.c)\n\n{description}```bash\nusage: {usage.strip()}\n```\n\n```bash\n{example.rstrip()}\n```') 55 | 56 | with open('readme.md', 'w') as f: 57 | f.write('\n'.join(before + after) + '\n') 58 | -------------------------------------------------------------------------------- /scripts/version.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | cd $(dirname $(dirname $0)) 5 | 6 | hash=$(git log -1 --pretty=%H || echo -) 7 | date=$(date -u +'%Y-%m-%dT%H:%M:%SZ') 8 | 9 | if [ -z "$(git status --porcelain)" ]; then 10 | devel=false 11 | else 12 | devel=true 13 | fi 14 | 15 | cat - <<EOF >util/version.h 16 | #define VERSION_GIT_HASH "git: $hash" 17 | #define VERSION_DATE "date: $date" 18 | #define VERSION_DEVEL "devel: $devel" 19 | EOF 20 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | setuptools.setup( 3 | version="0.0.1", 4 | license='mit', 5 | name='bsv', 6 | author='nathan todd-stone', 7 | author_email='me@nathants.com', 8 | url='http://github.com/nathants/bsv', 9 | py_modules=['bsv'], 10 | ) 11 | -------------------------------------------------------------------------------- /src/_bcopy.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "load.h" 3 | 
#include "dump.h" 4 | 5 | #define DESCRIPTION "pass through data, to benchmark load/dump performance\n\n" 6 | #define USAGE "... | bcopy \n\n" 7 | #define EXAMPLE ">> echo a,b,c | bsv | _bcopy | csv\na,b,c\n" 8 | 9 | int main(int argc, char **argv) { 10 | 11 | // setup bsv 12 | SETUP(); 13 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 14 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 15 | 16 | // setup state 17 | row_t row; 18 | 19 | // process input row by row 20 | while (1) { 21 | load_next(&rbuf, &row, 0); 22 | if (row.stop) 23 | break; 24 | dump(&wbuf, &row, 0); 25 | } 26 | dump_flush(&wbuf, 0); 27 | } 28 | -------------------------------------------------------------------------------- /src/_bcopyraw.c: -------------------------------------------------------------------------------- 1 | #define READ_GROWING 2 | #include "load.h" 3 | #include "dump.h" 4 | #include "array.h" 5 | #include "argh.h" 6 | 7 | #define DESCRIPTION "pass through data, to benchmark raw load/dump performance\n\n" 8 | #define USAGE "... | bcopy \n\n" 9 | #define EXAMPLE ">> echo a,b,c | bsv | _bcopyraw | csv\na,b,c\n" 10 | 11 | int main(int argc, char **argv) { 12 | 13 | // setup bsv 14 | SETUP(); 15 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 16 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 17 | 18 | // setup state 19 | row_t row; 20 | raw_row_t raw_row; 21 | ARRAY_INIT(array, raw_row_t*); 22 | 23 | // read 24 | while (1) { 25 | load_next(&rbuf, &row, 0); 26 | if (row.stop) 27 | break; 28 | row_to_raw(&row, &raw_row); 29 | dump_raw(&wbuf, &raw_row, 0); 30 | } 31 | dump_flush(&wbuf, 0); 32 | } 33 | -------------------------------------------------------------------------------- /src/_copy.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | 3 | int main(int argc, char **argv) { 4 | INCREASE_PIPE_SIZES(); 5 | u8 *buff; 6 | i32 size = 1024 * 16; 7 | buff = malloc(size); 8 | i32 wbytes, rbytes; 9 | while (1) { 10 | rbytes = fread_unlocked(buff, 1, size, stdin); 11 | wbytes = fwrite_unlocked(buff, 1, rbytes, stdout); 12 | ASSERT(wbytes == rbytes, "fatal: bad write\n"); 13 | if (rbytes != size) 14 | break; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/_csv.c: -------------------------------------------------------------------------------- 1 | #include "csv.h" 2 | #include "util.h" 3 | 4 | int main(int argc, char **argv) { 5 | SIGPIPE_HANDLER(); 6 | CSV_INIT(); 7 | while (1) { 8 | CSV_READ_LINE(stdin); 9 | if (csv_stop) 10 | break; 11 | for (int i = 0; i <= csv_max; i++) { 12 | fwrite(csv_columns[i], sizeof(char), csv_sizes[i], stdout); 13 | fwrite("\n", sizeof(char), 1, stdout); 14 | } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/_gen_bsv.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "util.h" 8 | #include "dump.h" 9 | 10 | void showusage() { 11 | FPRINTF(stderr, "\nusage: $ _gen_bsv NUM_COLUMNS NUM_ROWS MAXNUM_CHARS [--bytes]\n"); 12 | exit(1); 13 | } 14 | 15 | int main(int argc, char **argv) { 16 | SIGPIPE_HANDLER(); 17 | if (argc < 4) 18 | showusage(); 19 | i32 num_columns = atoi(argv[1]); 20 | i64 num_rows = atol(argv[2]); 21 | i32 max_chars = atoi(argv[3]) + 1; 22 | ASSERT(num_columns >= 0, "fatal: num_columns < 0"); 23 | ASSERT(num_rows >= 0, "fatal: num_rows < 
0"); 24 | bool bytes = false; 25 | if (argc == 5 && strcmp(argv[4], "--bytes") == 0) 26 | bytes = true; 27 | 28 | // setup bsv 29 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 30 | 31 | struct timespec ts; 32 | clock_gettime(CLOCK_MONOTONIC, &ts); 33 | srand(ts.tv_nsec); 34 | 35 | u8 *word; 36 | i32 size; 37 | i32 index; 38 | row_t row; 39 | u8 *buffer; 40 | MALLOC(buffer, BUFFER_SIZE); 41 | i32 i = 0; 42 | i32 offset; 43 | while (i++ < num_rows) { 44 | offset = 0; 45 | row.max = 0; 46 | for (i32 j = 0; j < num_columns; j++) { 47 | i32 num_chars = rand() % max_chars; 48 | for (i32 k = 0; k < num_chars; k++) { 49 | i32 val = rand(); 50 | if (bytes) { 51 | buffer[offset + k] = val; 52 | } else { 53 | if (val % 2) { 54 | buffer[offset + k] = (val % (122 - 97)) + 97; // a-z 55 | } else { 56 | buffer[offset + k] = (val % (57 - 48)) + 48; // 0-9 57 | } 58 | } 59 | } 60 | row.sizes[j] = num_chars; 61 | row.columns[j] = buffer + offset; 62 | offset += num_chars; 63 | row.max = j; 64 | 65 | } 66 | dump(&wbuf, &row, 0); 67 | } 68 | dump_flush(&wbuf, 0); 69 | } 70 | -------------------------------------------------------------------------------- /src/_gen_csv.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "util.h" 7 | 8 | void showusage() { 9 | FPRINTF(stderr, "\nusage: $ gen-csv NUM_COLUMNS NUM_ROWS MAXNUM_CHARS\n"); 10 | exit(1); 11 | } 12 | 13 | int main(int argc, char **argv) { 14 | SIGPIPE_HANDLER(); 15 | if (argc < 4) 16 | showusage(); 17 | i32 num_columns = atoi(argv[1]); 18 | i64 num_rows = atol(argv[2]); 19 | i32 max_chars = atoi(argv[3]) + 1; 20 | ASSERT(num_columns >= 0, "fatal: num_columns < 0"); 21 | ASSERT(num_rows >= 0, "fatal: num_rows < 0"); 22 | i32 num_words, add_delimiter; 23 | 24 | struct timespec ts; 25 | clock_gettime(CLOCK_MONOTONIC, &ts); 26 | srand(ts.tv_nsec); 27 | 28 | i32 i = 0; 29 | while (i++ < num_rows) { 30 | add_delimiter = 0; 31 | for (i32 j = 0; j < num_columns; j++) { 32 | if (add_delimiter) 33 | FPUTS(","); 34 | i32 num_chars = rand() % max_chars; 35 | for (i32 i = 0; i < num_chars; i++) { 36 | i32 val = rand(); 37 | char c[1]; 38 | if (val % 2) { 39 | c[0] = (val % (122 - 97)) + 97; // a-z 40 | } else { 41 | c[0] = (val % (57 - 48)) + 48; // 0-9 42 | } 43 | FPUTS(c); 44 | 45 | } 46 | add_delimiter = 1; 47 | } 48 | FPUTS("\n"); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/_queue.c: -------------------------------------------------------------------------------- 1 | #include "csv.h" 2 | #include "util.h" 3 | #include "queue.h" 4 | 5 | int main(int argc, char **argv) { 6 | SIGPIPE_HANDLER(); 7 | CSV_INIT(); 8 | ASSERT(argc == 2, "argc: %d != 2\n", argc); 9 | int capacity = atoi(argv[1]); 10 | queue_t *q = queue_init(capacity); 11 | while (1) { 12 | CSV_READ_LINE(stdin); 13 | if (csv_stop) 14 | break; 15 | ASSERT(0 == csv_max, "csv_max: %d != 0\n", csv_max); 16 | u8 *action = csv_columns[0]; 17 | i32 size = csv_sizes[0]; 18 | u8 *val; 19 | if (strncmp(action, "get", 3) == 0) { 20 | val = queue_get(q); 21 | if (val) { 22 | printf("%s\n", val); 23 | free(val); 24 | } else { 25 | printf("empty\n"); 26 | } 27 | } else if (strncmp(action, "put", 3) == 0) { 28 | action += 4; // action = "put VALUE" 29 | MALLOC(val, size - 4); 30 | memset(val, 0, size - 4); 31 | strncpy(val, action, size - 4); 32 | if (queue_put(q, val)) { 33 | printf("full\n"); 34 | } 35 | } 36 | } 37 | } 38 | 
-------------------------------------------------------------------------------- /src/bcat.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "argh.h" 3 | #include "load.h" 4 | #include "write_simple.h" 5 | 6 | #define DESCRIPTION "cat some bsv files to csv\n\n" 7 | #define USAGE "bcat [-l|--lz4] [-p|--prefix] [-h N|--head N] FILE1 ... FILEN\n\n" 8 | #define EXAMPLE \ 9 | ">> for char in a a b b c c; do\n" \ 10 | " echo $char | bsv >> /tmp/$char\n" \ 11 | " done\n\n" \ 12 | ">> bcat --head 1 --prefix /tmp/{a,b,c}\n" \ 13 | "/tmp/a:a\n" \ 14 | "/tmp/b:b\n" \ 15 | "/tmp/c:c\n" 16 | 17 | int main(int argc, char **argv) { 18 | // setup bsv 19 | SETUP(); 20 | 21 | // setup state 22 | i32 ran = 0; 23 | i64 line; 24 | 25 | // parse args 26 | bool prefix = false; 27 | bool lz4 = false; 28 | i64 head = 0; 29 | ARGH_PARSE { 30 | ARGH_NEXT(); 31 | if ARGH_BOOL("-p", "--prefix") { prefix = true; } 32 | else if ARGH_BOOL("-l", "--lz4") { lz4 = true; } 33 | else if ARGH_FLAG("-h", "--head") { ASSERT(isdigits(ARGH_VAL()), "fatal: should have been `--head INT`, not `--head %s`\n", ARGH_VAL()); 34 | head = atol(ARGH_VAL());} 35 | } 36 | 37 | // setup input 38 | ASSERT(ARGH_ARGC > 0, "usage: %s", USAGE); 39 | FILE *files[ARGH_ARGC]; 40 | for (i32 i = 0; i < ARGH_ARGC; i++) 41 | FOPEN(files[i], ARGH_ARGV[i], "rb"); 42 | readbuf_t rbuf = rbuf_init(files, ARGH_ARGC, lz4); 43 | row_t row; 44 | 45 | // setup output 46 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1); 47 | 48 | // process input row by row 49 | for (i32 i = 0; i < ARGH_ARGC; i++) { 50 | line = 0; 51 | while (1) { 52 | line++; 53 | load_next(&rbuf, &row, i); 54 | if (row.stop) 55 | break; 56 | if (head != 0 && line > head) 57 | break; 58 | if (prefix) { 59 | write_bytes(&wbuf, ARGH_ARGV[i], strlen(ARGH_ARGV[i]), 0); 60 | write_bytes(&wbuf, ":", 1, 0); 61 | } 62 | for (i32 j = 0; j <= row.max; j++) { 63 | write_bytes(&wbuf, row.columns[j], row.sizes[j], 0); 64 | if (j != row.max) 65 | write_bytes(&wbuf, ",", 1, 0); 66 | } 67 | write_bytes(&wbuf, "\n", 1, 0); 68 | ran = 1; 69 | } 70 | } 71 | if (ran == 0) 72 | write_bytes(&wbuf, "\n", 1, 0); 73 | write_flush(&wbuf, 0); 74 | } 75 | -------------------------------------------------------------------------------- /src/bcombine.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "load.h" 3 | #include "dump.h" 4 | 5 | #define DESCRIPTION "prepend a new column by combining values from existing columns\n\n" 6 | #define USAGE "... 
| bcombine COL1,...,COLN\n\n" 7 | #define EXAMPLE ">> echo a,b,c | bsv | bcombine 3,2 | csv\nc:b,a,b,c\n" 8 | 9 | #define PARSE_ARGV() \ 10 | do { \ 11 | ASSERT(argc == 2, "usage: %s", USAGE); \ 12 | char *f; \ 13 | char *fs = (char*)argv[1]; \ 14 | while ((f = strsep(&fs, ","))) { \ 15 | index = atoi(f); \ 16 | indices[num_fields++] = index - 1; \ 17 | ASSERT(index <= MAX_COLUMNS, "fatal: cannot select indices above %d, tried to select: %d\n", MAX_COLUMNS, index); \ 18 | ASSERT(index > 0, "fatal: indices must be positive, got: %d\n", index); \ 19 | } \ 20 | ASSERT(num_fields <= MAX_COLUMNS, "fatal: cannot select more than %d indices\n", MAX_COLUMNS); \ 21 | } while (0) 22 | 23 | int main(int argc, char **argv) { 24 | 25 | // setup bsv 26 | SETUP(); 27 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 28 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 29 | 30 | // setup state 31 | i32 num_fields = 0; 32 | i32 indices[MAX_COLUMNS]; 33 | i32 index; 34 | PARSE_ARGV(); 35 | row_t row; 36 | row_t new; 37 | u8 *buffer; 38 | MALLOC(buffer, BUFFER_SIZE); 39 | i32 size; 40 | 41 | // process input row by row 42 | while (1) { 43 | load_next(&rbuf, &row, 0); 44 | if (row.stop) 45 | break; 46 | for (i32 i = 0; i <= row.max; i++) { 47 | new.sizes[i + 1] = row.sizes[i]; 48 | new.columns[i + 1] = row.columns[i]; 49 | } 50 | size = 0; 51 | for (i32 i = 0; i < num_fields; i++) { 52 | index = indices[i]; 53 | ASSERT(index <= row.max, "fatal: line with %d columns, needed %d\n", row.max + 1, index + 1); 54 | ASSERT(size + row.sizes[index] < BUFFER_SIZE, "fatal: bcombine buffer overflow\n"); 55 | memcpy(buffer + size, row.columns[index], row.sizes[index]); 56 | size += row.sizes[index]; 57 | if (i < num_fields - 1) { 58 | buffer[size] = ':'; 59 | size++; 60 | } 61 | } 62 | new.columns[0] = buffer; 63 | new.sizes[0] = size; 64 | new.max = row.max + 1; 65 | dump(&wbuf, &new, 0); 66 | } 67 | dump_flush(&wbuf, 0); 68 | } 69 | -------------------------------------------------------------------------------- /src/bcounteach.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "load.h" 3 | #include "dump.h" 4 | 5 | #define DESCRIPTION "count as i64 each contiguous identical row by the first column\n\n" 6 | #define USAGE "... 
| bcounteach\n\n" 7 | #define EXAMPLE "echo '\na\na\nb\nb\nb\na\n' | bsv | bcounteach | bschema *,i64:a | csv\na,2\nb,3\na,1\n" 8 | 9 | #define DUMP_COUNT() \ 10 | do { \ 11 | if (size > 0) { \ 12 | new.columns[0] = buffer; \ 13 | new.sizes[0] = size; \ 14 | new.columns[1] = &count; \ 15 | new.sizes[1] = sizeof(i64); \ 16 | new.max = 1; \ 17 | dump(&wbuf, &new, 0); \ 18 | } \ 19 | } while(0) 20 | 21 | int main(int argc, char **argv) { 22 | 23 | // setup bsv 24 | SETUP(); 25 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 26 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 27 | 28 | // setup state 29 | i64 count = 0; 30 | i32 size = 0; 31 | u8 *buffer; 32 | row_t row; 33 | row_t new; 34 | MALLOC(buffer, BUFFER_SIZE); 35 | 36 | // process input row by row 37 | while (1) { 38 | load_next(&rbuf, &row, 0); 39 | if (row.stop) 40 | break; 41 | count++; 42 | if (compare_str(buffer, row.columns[0]) != 0) { 43 | DUMP_COUNT(); 44 | memcpy(buffer, row.columns[0], row.sizes[0] + 1); // +1 for the trailing \0 45 | size = row.sizes[0]; 46 | count = 0; 47 | } 48 | } 49 | 50 | // flush last value 51 | count += 1; 52 | DUMP_COUNT(); 53 | dump_flush(&wbuf, 0); 54 | } 55 | -------------------------------------------------------------------------------- /src/bcounteach_hash.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "load.h" 3 | #include "dump.h" 4 | #include "map.h" 5 | 6 | #define DESCRIPTION "count as i64 by hash of the first column\n\n" 7 | #define USAGE "... | bcounteach-hash\n\n" 8 | #define EXAMPLE "echo '\na\na\nb\nb\nb\na\n' | bsv | bcounteach-hash | bschema *,i64:a | bsort | csv\na,3\nb,3\n" 9 | 10 | int main(int argc, char **argv) { 11 | 12 | // setup bsv 13 | SETUP(); 14 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 15 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 16 | 17 | // setup state 18 | row_t row; 19 | 20 | MAP_INIT(counts, i64, 1<<16); 21 | MAP_ALLOC(counts, i64); 22 | 23 | while (1) { 24 | load_next(&rbuf, &row, 0); 25 | if (row.stop) { 26 | break; 27 | } 28 | MAP_SET_INDEX(counts, row.columns[0], row.sizes[0], i64); 29 | MAP_VALUE(counts)++; 30 | } 31 | 32 | for (i32 i = 0; i < MAP_SIZE(counts); i++) { 33 | if (MAP_KEYS(counts)[i] != NULL) { 34 | row.max = 1; 35 | row.columns[0] = MAP_KEYS(counts)[i]; 36 | row.sizes[0] = MAP_SIZES(counts)[i]; 37 | row.columns[1] = &MAP_VALUES(counts)[i]; 38 | row.sizes[1] = sizeof(i64); 39 | dump(&wbuf, &row, 0); 40 | } 41 | } 42 | dump_flush(&wbuf, 0); 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/bcountrows.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "load.h" 3 | #include "dump.h" 4 | 5 | #define DESCRIPTION "count rows as i64\n\n" 6 | #define USAGE "... 
| bcountrows\n\n" 7 | #define EXAMPLE ">> echo '\n1\n2\n3\n4\n' | bsv | bcountrows | csv\n4\n\n" 8 | 9 | int main(int argc, char **argv) { 10 | 11 | // setup bsv 12 | SETUP(); 13 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 14 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 15 | 16 | // setup state 17 | i64 count = 0; 18 | row_t row; 19 | 20 | // process input row by row 21 | while (1) { 22 | load_next(&rbuf, &row, 0); 23 | if (row.stop) 24 | break; 25 | count++; 26 | } 27 | 28 | // output value 29 | row.max = 0; 30 | row.columns[0] = &count; 31 | row.sizes[0] = sizeof(i64); 32 | dump(&wbuf, &row, 0); 33 | dump_flush(&wbuf, 0); 34 | } 35 | -------------------------------------------------------------------------------- /src/bcut.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "load.h" 3 | #include "dump.h" 4 | 5 | #define DESCRIPTION "select some columns\n\n" 6 | #define USAGE "... | bcut COL1,...,COLN\n\n" 7 | #define EXAMPLE ">> echo a,b,c | bsv | bcut 3,3,3,2,2,1 | csv\nc,c,c,b,b,a\n" 8 | 9 | int main(int argc, char **argv) { 10 | 11 | // setup bsv 12 | SETUP(); 13 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 14 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 15 | 16 | // setup state 17 | i32 num_fields = 0; 18 | i32 indices[MAX_COLUMNS]; 19 | i32 index; 20 | ASSERT(argc == 2, "usage: %s", USAGE); 21 | char *f; 22 | char *fs = (char*)argv[1]; 23 | while ((f = strsep(&fs, ","))) { 24 | index = atoi(f); 25 | indices[num_fields++] = index - 1; 26 | ASSERT(index <= MAX_COLUMNS, "fatal: cannot select indices above %d, tried to select: %d\n", MAX_COLUMNS, index); 27 | ASSERT(index > 0, "fatal: indices must be positive, got: %d\n", index); 28 | } 29 | ASSERT(num_fields <= MAX_COLUMNS, "fatal: cannot select more than %d indices\n", MAX_COLUMNS); 30 | row_t row; 31 | row_t new; 32 | 33 | // process input row by row 34 | while (1) { 35 | load_next(&rbuf, &row, 0); 36 | if (row.stop) 37 | break; 38 | for (i32 i = 0; i < num_fields; i++) { 39 | index = indices[i]; 40 | ASSERT(index <= row.max, "fatal: line with %d columns, needed %d\n", row.max + 1, index + 1); 41 | new.columns[i] = row.columns[index]; 42 | new.sizes[i] = row.sizes[index]; 43 | } 44 | new.max = num_fields - 1; 45 | dump(&wbuf, &new, 0); 46 | } 47 | dump_flush(&wbuf, 0); 48 | } 49 | -------------------------------------------------------------------------------- /src/bdedupe.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "load.h" 3 | #include "dump.h" 4 | 5 | #define DESCRIPTION "dedupe identical contiguous rows by the first column, keeping the first\n\n" 6 | #define USAGE "... 
| bdedupe\n\n" 7 | #define EXAMPLE ">> echo '\na\na\nb\nb\na\na\n' | bsv | bdedupe | csv\na\nb\na\n" 8 | 9 | int main(int argc, char **argv) { 10 | 11 | // setup bsv 12 | SETUP(); 13 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 14 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 15 | 16 | // setup state 17 | u8 *buffer; 18 | MALLOC(buffer, BUFFER_SIZE); 19 | memset(buffer, 0, BUFFER_SIZE); 20 | row_t row; 21 | 22 | // process input row by row 23 | while (1) { 24 | load_next(&rbuf, &row, 0); 25 | if (row.stop) 26 | break; 27 | if (compare_str(buffer, row.columns[0]) != 0) { 28 | dump(&wbuf, &row, 0); 29 | memcpy(buffer, row.columns[0], row.sizes[0] + 1); // +1 for the trailing \0 30 | } 31 | } 32 | dump_flush(&wbuf, 0); 33 | } 34 | -------------------------------------------------------------------------------- /src/bdedupe_hash.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "load.h" 3 | #include "dump.h" 4 | #include "map.h" 5 | 6 | #define DESCRIPTION "dedupe rows by hash of the first column, keeping the first\n\n" 7 | #define USAGE "... | bdedupe-hash\n\n" 8 | #define EXAMPLE ">> echo '\na\na\nb\nb\na\na\n' | bsv | bdedupe-hash | csv\na\nb\n" 9 | 10 | int main(int argc, char **argv) { 11 | 12 | // setup bsv 13 | SETUP(); 14 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 15 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 16 | 17 | // setup state 18 | row_t row; 19 | MAP_INIT(dupes, u8, 1<<16); 20 | MAP_ALLOC(dupes, u8); 21 | 22 | // process input row by row 23 | while (1) { 24 | load_next(&rbuf, &row, 0); 25 | if (row.stop) 26 | break; 27 | MAP_SET_INDEX(dupes, row.columns[0], row.sizes[0], u8); 28 | if (MAP_VALUE(dupes) == 0) { 29 | MAP_VALUE(dupes) = 1; 30 | dump(&wbuf, &row, 0); 31 | } 32 | } 33 | dump_flush(&wbuf, 0); 34 | } 35 | -------------------------------------------------------------------------------- /src/bdropuntil.c: -------------------------------------------------------------------------------- 1 | #include "read_ahead.h" 2 | #include "util.h" 3 | #include "load.h" 4 | #include "dump.h" 5 | 6 | #define DESCRIPTION "for sorted input, drop until the first column is gte to VALUE\n\n" 7 | #define USAGE "... 
| bdropuntil VALUE [TYPE]\n\n" 8 | #define EXAMPLE ">> echo '\na\nb\nc\nd\n' | bsv | bdropuntil c | csv\nc\nd\n\n" 9 | 10 | int main(int argc, char **argv) { 11 | 12 | // setup bsv 13 | SETUP(); 14 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 15 | readaheadbuf_t rabuf = rabuf_init(1); 16 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 17 | 18 | // setup state 19 | bool done_skipping = false; 20 | bool matched = false; 21 | i32 cmp; 22 | row_t row; 23 | ASSERT(argc >= 2, "usage: %s", USAGE); 24 | i64 val_i64; 25 | i32 val_i32; 26 | i16 val_i16; 27 | u64 val_u64; 28 | u32 val_u32; 29 | u16 val_u16; 30 | f64 val_f64; 31 | f32 val_f32; 32 | void *val; 33 | i32 value_type; 34 | if (argc == 2) { 35 | val = argv[1]; 36 | value_type = STR; 37 | } else { 38 | ASSERT(argc == 3, "usage: %s", USAGE); 39 | if (strcmp(argv[2], "i64") == 0) { value_type = I64; val_i64 = atol(argv[1]); val = &val_i64; } 40 | else if (strcmp(argv[2], "i32") == 0) { value_type = I32; val_i32 = atol(argv[1]); val = &val_i32; } 41 | else if (strcmp(argv[2], "i16") == 0) { value_type = I16; val_i16 = atol(argv[1]); val = &val_i16; } 42 | else if (strcmp(argv[2], "u64") == 0) { value_type = U64; val_u64 = atol(argv[1]); val = &val_u64; } 43 | else if (strcmp(argv[2], "u32") == 0) { value_type = U32; val_u32 = atol(argv[1]); val = &val_u32; } 44 | else if (strcmp(argv[2], "u16") == 0) { value_type = U16; val_u16 = atol(argv[1]); val = &val_u16; } 45 | else if (strcmp(argv[2], "f64") == 0) { value_type = F64; val_f64 = atof(argv[1]); val = &val_f64; } 46 | else if (strcmp(argv[2], "f32") == 0) { value_type = F32; val_f32 = atof(argv[1]); val = &val_f32; } 47 | else ASSERT(0, "fatal: bad type %s\n", argv[2]); 48 | } 49 | 50 | // process input row by row 51 | while (1) { 52 | load_next(&rbuf, &row, 0); 53 | if (row.stop) { // ----------------------------------------------- reached the last chunk and possibly need to backup to the previous chunk to find a match 54 | if (done_skipping) { // -------------------------------------- already gone back to the previous chunk, time to stop 55 | break; 56 | } else { // -------------------------------------------------- go back and check the entire last chunk for a match 57 | read_goto_last_chunk(&rbuf, &rabuf, 0); 58 | done_skipping = true; 59 | } 60 | } else { // ------------------------------------------------------ reading data chunk by chunk, checking the first row and the proceeding to the next chunk 61 | ASSERT_SIZE(value_type, row.sizes[0]); 62 | if (matched) { // -------------------------------------------- once a match is found dump every row 63 | dump(&wbuf, &row, 0); 64 | } else { // -------------------------------------------------- check for a match 65 | cmp = compare(value_type, row.columns[0], val); 66 | if (done_skipping) { // ---------------------------------- since we are done skipping ahead by chunks, check every row for a match 67 | if (cmp >= 0) { 68 | dump(&wbuf, &row, 0); 69 | matched = true; 70 | } 71 | } else if (cmp < 0) { // --------------------------------- we aren't done skipping ahead, we want to keep skipping until we've gone too far 72 | read_goto_next_chunk(&rbuf, &rabuf, 0); 73 | } else { // ---------------------------------------------- we've gone too far, time to backup one chunk and start checking every row 74 | read_goto_last_chunk(&rbuf, &rabuf, 0); 75 | done_skipping = true; 76 | } 77 | } 78 | } 79 | } 80 | dump_flush(&wbuf, 0); 81 | } 82 | -------------------------------------------------------------------------------- 
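bdropuntil leans on the chunk structure: it skips forward chunk by chunk while even the first row of a chunk is still below VALUE, then backs up one chunk and scans row by row, which is why the input must be sorted. A sketch with typed values, assuming bschema's a:i64 and i64:a single-column conversions:

```bash
>> seq 1 1000000 | bsv | bschema a:i64 | bdropuntil 999998 i64 | bschema i64:a | csv
999998
999999
1000000
```
--------------------------------------------------------------------------------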
/src/bhead.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "load.h" 3 | #include "dump.h" 4 | 5 | #define DESCRIPTION "keep the first n rows\n\n" 6 | #define USAGE "... | bhead N\n\n" 7 | #define EXAMPLE ">> echo '\na\nb\nc\n' | bsv | bhead 2 | csv\na\nb\n" 8 | 9 | int main(int argc, char **argv) { 10 | 11 | // setup bsv 12 | SETUP(); 13 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 14 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 15 | 16 | // setup state 17 | ASSERT(argc == 2 && isdigits(argv[1]), "usage: %s", USAGE); 18 | row_t row; 19 | i64 max = atol(argv[1]); 20 | i64 count = 0; 21 | 22 | // process input row by row 23 | while (1) { 24 | load_next(&rbuf, &row, 0); 25 | if (row.stop || count++ >= max) 26 | break; 27 | dump(&wbuf, &row, 0); 28 | } 29 | dump_flush(&wbuf, 0); 30 | } 31 | -------------------------------------------------------------------------------- /src/blz4.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "lz4.h" 3 | 4 | #define DESCRIPTION "compress bsv data\n\n" 5 | #define USAGE "... | blz4 \n\n" 6 | #define EXAMPLE ">> echo a,b,c | bsv | blz4 | blz4d | csv\na,b,c\n" 7 | 8 | int main(int argc, char **argv) { 9 | 10 | // setup bsv 11 | SETUP(); 12 | 13 | // setup state 14 | i32 size; 15 | i32 lz4_size; 16 | u8 *buf; 17 | MALLOC(buf, BUFFER_SIZE); 18 | u8 *lz4_buf; 19 | MALLOC(lz4_buf, BUFFER_SIZE_LZ4); 20 | ASSERT(LZ4_compressBound(BUFFER_SIZE) <= BUFFER_SIZE_LZ4, "fatal: lz4 compress bound\n"); 21 | 22 | // process input row by row 23 | while (1) { 24 | if (0 == fread_unlocked(&size, 1, sizeof(i32), stdin)) 25 | break; 26 | FREAD(buf, size, stdin); 27 | lz4_size = LZ4_compress_fast(buf, lz4_buf, size, BUFFER_SIZE_LZ4, LZ4_ACCELERATION); 28 | FWRITE(&size, sizeof(i32), stdout); 29 | FWRITE(&lz4_size, sizeof(i32), stdout); 30 | FWRITE(lz4_buf, lz4_size, stdout); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/blz4d.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "lz4.h" 3 | 4 | #define DESCRIPTION "decompress bsv data\n\n" 5 | #define USAGE "... 
| blz4d \n\n" 6 | #define EXAMPLE ">> echo a,b,c | bsv | blz4 | blz4d | csv\na,b,c\n" 7 | 8 | int main(int argc, char **argv) { 9 | 10 | // setup bsv 11 | SETUP(); 12 | 13 | // setup state 14 | i32 size; 15 | i32 lz4_size; 16 | u8 *buf; 17 | MALLOC(buf, BUFFER_SIZE); 18 | u8 *lz4_buf; 19 | MALLOC(lz4_buf, BUFFER_SIZE_LZ4); 20 | ASSERT(LZ4_compressBound(BUFFER_SIZE) <= BUFFER_SIZE_LZ4, "fatal: lz4 compress bound\n"); 21 | 22 | // process input row by row 23 | while (1) { 24 | if (0 == fread_unlocked(&size, 1, sizeof(i32), stdin)) 25 | break; 26 | FREAD(&lz4_size, sizeof(i32), stdin); 27 | FREAD(lz4_buf, lz4_size, stdin); 28 | ASSERT(size == LZ4_decompress_safe(lz4_buf, buf, lz4_size, BUFFER_SIZE), "fatal: decompress size mismatch\n"); 29 | FWRITE(&size, sizeof(i32), stdout); 30 | FWRITE(buf, size, stdout); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/bpartition.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "argh.h" 3 | #include "load.h" 4 | #include "dump.h" 5 | #include "xxh3.h" 6 | #include 7 | #include 8 | #include 9 | 10 | #define SEED 0 11 | 12 | #define DESCRIPTION "split into multiple files by consistent hash of the first column value\n\n" 13 | #define USAGE "\n... | bpartition NUM_BUCKETS [PREFIX] [-l|--lz4]\n\n" 14 | #define EXAMPLE ">> echo '\na\nb\nc\n' | bsv | bpartition 10 prefix\nprefix03\nprefix06\n" 15 | 16 | int empty_file(char *path) { 17 | struct stat st; 18 | if (stat(path, &st) == 0) 19 | return st.st_size == 0; 20 | return -1; 21 | } 22 | 23 | int main(int argc, char **argv) { 24 | 25 | // setup bsv 26 | SETUP(); 27 | 28 | // setup input 29 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 30 | 31 | // setup state 32 | row_t row; 33 | u8 *prefix; 34 | u8 num_buckets_str[16]; 35 | u8 path[1024]; 36 | i32 empty; 37 | i32 num_buckets; 38 | u64 file_num; 39 | u64 hash; 40 | 41 | // parse args 42 | bool lz4 = false; 43 | ARGH_PARSE { 44 | ARGH_NEXT(); 45 | if ARGH_BOOL("-l", "--lz4") { lz4 = true; } 46 | } 47 | ASSERT(ARGH_ARGC >= 1, "usage: %s", USAGE); 48 | ASSERT(strlen(ARGH_ARGV[0]) <= 8, "NUM_BUCKETS must be less than 1e8, got: %s\n", argv[1]); 49 | num_buckets = atoi(ARGH_ARGV[0]); 50 | ASSERT(num_buckets > 0, "NUM_BUCKETS must be positive, got: %d\n", num_buckets); 51 | if (ARGH_ARGC == 2) { 52 | prefix = ARGH_ARGV[1]; 53 | } else { 54 | prefix = ""; 55 | } 56 | 57 | // open output files 58 | FILE *files[num_buckets]; 59 | SNPRINTF(num_buckets_str, sizeof(num_buckets_str), "%d", num_buckets); 60 | for (i32 i = 0; i < num_buckets; i++) { 61 | if (strlen(prefix) != 0) 62 | SNPRINTF(path, sizeof(path), "%s_%0*d", prefix, strlen(num_buckets_str), i); 63 | else 64 | SNPRINTF(path, sizeof(path), "%0*d", strlen(num_buckets_str), i); 65 | FOPEN(files[i], path, "ab"); 66 | } 67 | 68 | // setup output 69 | writebuf_t wbuf = wbuf_init(files, num_buckets, lz4); 70 | 71 | // for 1 bucket, pipe the data straight through 72 | if (num_buckets == 1) { 73 | i32 rbytes; 74 | i32 chunk_size; 75 | while (1) { 76 | rbytes = fread_unlocked(&chunk_size, 1, sizeof(i32), rbuf.files[0]); 77 | ASSERT(rbytes == 0 || rbytes == sizeof(i32), "fatal: bad bpartition chunk read %d\n", rbytes); 78 | if (rbytes != sizeof(i32)) 79 | break; 80 | FREAD(wbuf.buffer[0], chunk_size, rbuf.files[0]); 81 | wbuf.offset[0] = chunk_size; 82 | write_flush(&wbuf, 0); 83 | } 84 | 85 | // for more than 1 bucket, process input row by row 86 | } else { 87 | while (1) { 88 | 
load_next(&rbuf, &row, 0); 89 | if (row.stop) 90 | break; 91 | hash = XXH3_64bits(row.columns[0], row.sizes[0]); 92 | file_num = hash % num_buckets; 93 | dump(&wbuf, &row, file_num); 94 | } 95 | } 96 | 97 | // flush and close 98 | for (i32 i = 0; i < num_buckets; i++) { 99 | dump_flush(&wbuf, i); 100 | ASSERT(fclose(files[i]) != EOF, "fatal: failed to close files\n"); 101 | } 102 | 103 | // delete any empty output files 104 | for (i32 i = 0; i < num_buckets; i++) { 105 | if (strlen(prefix) != 0) 106 | SNPRINTF(path, sizeof(path), "%s_%0*d", prefix, strlen(num_buckets_str), i); 107 | else 108 | SNPRINTF(path, sizeof(path), "%0*d", strlen(num_buckets_str), i); 109 | empty = empty_file(path); 110 | if (empty == 1) { 111 | ASSERT(remove(path) == 0, "fatal: failed to delete file: %s\n", path); 112 | } else { 113 | ASSERT(empty != -1, "fatal: failed to stat file: %s\n", path); 114 | FPRINTF(stdout, "%s\n", path); 115 | } 116 | } 117 | 118 | } 119 | -------------------------------------------------------------------------------- /src/bquantile_merge.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "ddsketch.h" 3 | #include "load.h" 4 | #include "dump.h" 5 | #include "argh.h" 6 | 7 | #define DESCRIPTION "merge ddsketches and output quantile value pairs as f64\n\n" 8 | #define USAGE "... | bquantile-merge QUANTILES \n\n" 9 | #define EXAMPLE ">> seq 1 100 | bsv | bschema a:i64 | bquantile-sketch i64 | bquantile-merge .2,.5,.7 | bschema f64:a,f64:a | csv\n0.2,19.88667024086646\n0.5,49.90296094906742\n0.7,70.11183939140405\n" 10 | 11 | int main(int argc, char **argv) { 12 | 13 | // setup bsv 14 | SETUP(); 15 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 16 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 17 | 18 | // parse args 19 | ASSERT(argc == 2, "usage: %s", USAGE); 20 | i32 num_quantiles = 0; 21 | f64 quantiles[MAX_COLUMNS]; 22 | f64 quantile; 23 | char *f; 24 | char *fs = (char*)argv[1]; 25 | while ((f = strsep(&fs, ","))) { 26 | ASSERT(isdigits_ordot(f), "fatal: bad arg\n"); 27 | quantile = atof(f); 28 | ASSERT(quantile >= 0 && quantile <= 1, "fatal: bad arg\n"); 29 | quantiles[num_quantiles++] = quantile; 30 | } 31 | 32 | // setup state 33 | row_t row; 34 | sketch_t *s = NULL; 35 | sketch_t *o; 36 | 37 | // process input row by row 38 | while (1) { 39 | load_next(&rbuf, &row, 0); 40 | if (row.stop) 41 | break; 42 | o = sketch_from_row(&row); 43 | if (s) { 44 | ASSERT(s->config->max_num_bins == o->config->max_num_bins, "fatal: must merge sketches with same config settings\n"); 45 | ASSERT(s->config->gamma == o->config->gamma, "fatal: must merge sketches with same config settings\n"); 46 | ASSERT(s->config->min_value == o->config->min_value, "fatal: must merge sketches with same config settings\n"); 47 | sketch_merge(s, o); 48 | } else 49 | s = o; 50 | } 51 | 52 | // dump quantiles 53 | f64 val; 54 | for (i32 i = 0; i < num_quantiles; i++) { 55 | row.max = 1; 56 | row.columns[0] = &quantiles[i]; 57 | row.sizes[0] = sizeof(f64); 58 | val = sketch_quantile(s, quantiles[i]); 59 | row.columns[1] = &val; 60 | row.sizes[1] = sizeof(f64); 61 | dump(&wbuf, &row, 0); 62 | } 63 | dump_flush(&wbuf, 0); 64 | } 65 | -------------------------------------------------------------------------------- /src/bquantile_sketch.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "ddsketch.h" 3 | #include "load.h" 4 | #include "dump.h" 5 | #include "argh.h" 6 | 
7 | #define DESCRIPTION "collapse the first column into a single row ddsketch\n\n" 8 | #define USAGE "... | bquantile-sketch TYPE [-a|--alpha] [-b|--max-bins] [-m|--min-value] \n\n" 9 | #define EXAMPLE ">> seq 1 100 | bsv | bschema a:i64 | bquantile-sketch i64 | bquantile-merge .2,.5,.7 | bschema f64:a,f64:a | csv\n0.2,19.88667024086646\n0.5,49.90296094906742\n0.7,70.11183939140405\n" 10 | 11 | int main(int argc, char **argv) { 12 | 13 | // setup bsv 14 | SETUP(); 15 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 16 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 17 | 18 | // parse args 19 | f64 alpha = DEFAULT_ALPHA; 20 | i32 max_num_bins = DEFAULT_MAX_NUM_BINS; 21 | f64 min_value = DEFAULT_MIN_VALUE; 22 | ARGH_PARSE { 23 | ARGH_NEXT(); 24 | if ARGH_FLAG("-a", "--alpha") { alpha = atof(ARGH_VAL()); ASSERT(isdigits_ordot(ARGH_VAL()), "fatal: bad arg\n"); } 25 | else if ARGH_FLAG("-m", "--min-value") { min_value = atof(ARGH_VAL()); ASSERT(isdigits_ordot(ARGH_VAL()), "fatal: bad arg\n"); } 26 | else if ARGH_FLAG("-b", "--max-bins") { max_num_bins = atoi(ARGH_VAL()); ASSERT(isdigits(ARGH_VAL()), "fatal: bad arg\n"); } 27 | } 28 | i32 value_type; 29 | ASSERT(ARGH_ARGC == 1, "usage: %s", USAGE); 30 | if (strcmp(ARGH_ARGV[0], "i64") == 0) value_type = I64; 31 | else if (strcmp(ARGH_ARGV[0], "i32") == 0) value_type = I32; 32 | else if (strcmp(ARGH_ARGV[0], "i16") == 0) value_type = I16; 33 | else if (strcmp(ARGH_ARGV[0], "u64") == 0) value_type = U64; 34 | else if (strcmp(ARGH_ARGV[0], "u32") == 0) value_type = U32; 35 | else if (strcmp(ARGH_ARGV[0], "u16") == 0) value_type = U16; 36 | else if (strcmp(ARGH_ARGV[0], "f64") == 0) value_type = F64; 37 | else if (strcmp(ARGH_ARGV[0], "f32") == 0) value_type = F32; 38 | else ASSERT(0, "fatal: bad type %s\n", ARGH_ARGV[0]); 39 | 40 | // setup state 41 | row_t row; 42 | config_t *c = config_new(alpha, max_num_bins, min_value); 43 | sketch_t *s = sketch_new(c); 44 | 45 | // process input row by row 46 | while (1) { 47 | load_next(&rbuf, &row, 0); 48 | if (row.stop) 49 | break; 50 | ASSERT_SIZE(value_type, row.sizes[0]); 51 | switch (value_type) { 52 | case I64: sketch_add(s, (f64)*(i64*)(row.columns[0])); break; 53 | case I32: sketch_add(s, (f64)*(i32*)(row.columns[0])); break; 54 | case I16: sketch_add(s, (f64)*(i16*)(row.columns[0])); break; 55 | case U64: sketch_add(s, (f64)*(u64*)(row.columns[0])); break; 56 | case U32: sketch_add(s, (f64)*(u32*)(row.columns[0])); break; 57 | case U16: sketch_add(s, (f64)*(u16*)(row.columns[0])); break; 58 | case F64: sketch_add(s, (f64)*(f64*)(row.columns[0])); break; 59 | case F32: sketch_add(s, (f64)*(f32*)(row.columns[0])); break; 60 | } 61 | } 62 | 63 | // dump sketch 64 | sketch_to_row(&row, s); 65 | dump(&wbuf, &row, 0); 66 | dump_flush(&wbuf, 0); 67 | } 68 | -------------------------------------------------------------------------------- /src/bsort.c: -------------------------------------------------------------------------------- 1 | #define READ_GROWING 2 | #include "load.h" 3 | #include "dump.h" 4 | #include "array.h" 5 | #include "argh.h" 6 | 7 | #define DESCRIPTION "timsort rows by the first column\n\n" 8 | #define USAGE "... 
| bsort [-r|--reversed] [TYPE]\n\n" 9 | #define EXAMPLE ">> echo '\n3\n2\n1\n' | bsv | bschema a:i64 | bsort i64 | bschema i64:a | csv\n1\n2\n3\n\n" 10 | 11 | #define SORT_NAME row 12 | #define SORT_TYPE raw_row_t 13 | #define SORT_CMP(x, y) compare((x).meta, (x).buffer, (y).buffer) 14 | #include "sort.h" 15 | 16 | int main(int argc, char **argv) { 17 | 18 | // setup bsv 19 | SETUP(); 20 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 21 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 22 | 23 | // setup state 24 | row_t row; 25 | raw_row_t raw_row; 26 | ARRAY_INIT(array, raw_row_t); 27 | 28 | // parse args 29 | bool reversed = false; 30 | ARGH_PARSE { 31 | ARGH_NEXT(); 32 | if ARGH_BOOL("-r", "--reversed") { reversed = true; } 33 | } 34 | 35 | i32 value_type; 36 | if (!ARGH_ARGC) 37 | if (reversed) 38 | value_type = R_STR; 39 | else 40 | value_type = STR; 41 | else { 42 | ASSERT(ARGH_ARGC == 1, "usage: %s", USAGE); 43 | if (reversed) { 44 | if (strcmp(ARGH_ARGV[0], "i64") == 0) value_type = R_I64; 45 | else if (strcmp(ARGH_ARGV[0], "i32") == 0) value_type = R_I32; 46 | else if (strcmp(ARGH_ARGV[0], "i16") == 0) value_type = R_I16; 47 | else if (strcmp(ARGH_ARGV[0], "u64") == 0) value_type = R_U64; 48 | else if (strcmp(ARGH_ARGV[0], "u32") == 0) value_type = R_U32; 49 | else if (strcmp(ARGH_ARGV[0], "u16") == 0) value_type = R_U16; 50 | else if (strcmp(ARGH_ARGV[0], "f64") == 0) value_type = R_F64; 51 | else if (strcmp(ARGH_ARGV[0], "f32") == 0) value_type = R_F32; 52 | else ASSERT(0, "fatal: bad type %s\n", ARGH_ARGV[0]); 53 | } else { 54 | if (strcmp(ARGH_ARGV[0], "i64") == 0) value_type = I64; 55 | else if (strcmp(ARGH_ARGV[0], "i32") == 0) value_type = I32; 56 | else if (strcmp(ARGH_ARGV[0], "i16") == 0) value_type = I16; 57 | else if (strcmp(ARGH_ARGV[0], "u64") == 0) value_type = U64; 58 | else if (strcmp(ARGH_ARGV[0], "u32") == 0) value_type = U32; 59 | else if (strcmp(ARGH_ARGV[0], "u16") == 0) value_type = U16; 60 | else if (strcmp(ARGH_ARGV[0], "f64") == 0) value_type = F64; 61 | else if (strcmp(ARGH_ARGV[0], "f32") == 0) value_type = F32; 62 | else ASSERT(0, "fatal: bad type %s\n", ARGH_ARGV[0]); 63 | } 64 | } 65 | 66 | // read 67 | while (1) { 68 | load_next(&rbuf, &row, 0); 69 | if (row.stop) 70 | break; 71 | ASSERT_SIZE(value_type, row.sizes[0]); 72 | row_to_raw(&row, &raw_row); 73 | raw_row.meta = value_type; 74 | ARRAY_APPEND(array, raw_row, raw_row_t); 75 | } 76 | 77 | // sort 78 | row_tim_sort(array, array_size); 79 | 80 | // write 81 | for (i32 i = 0; i < array_size; i++) 82 | dump_raw(&wbuf, &array[i], 0); 83 | dump_flush(&wbuf, 0); 84 | } 85 | -------------------------------------------------------------------------------- /src/bsplit.c: -------------------------------------------------------------------------------- 1 | #include "read_ahead.h" 2 | #include "util.h" 3 | #include "load.h" 4 | #include "write.h" 5 | 6 | #define DESCRIPTION "split a stream into multiple files\n\n" 7 | #define USAGE "... 
| bsplit PREFIX [chunks_per_file=1] \n\n" 8 | #define EXAMPLE ">> echo -n a,b,c | bsv | bsplit prefix\nprefix_0000000000\n" 9 | 10 | int main(int argc, char **argv) { 11 | 12 | // setup bsv 13 | SETUP(); 14 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 15 | readaheadbuf_t rabuf = rabuf_init(1); 16 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 17 | 18 | // setup state 19 | i32 i = 0; 20 | i32 j = 0; 21 | ASSERT(argc >= 2, "usage: %s", USAGE); 22 | u8 *prefix = argv[1]; 23 | u8 filename[1024]; 24 | FILE *f = NULL; 25 | i32 chunks_per_file = 1; 26 | row_t row; 27 | 28 | // parse args 29 | if (argc == 3) 30 | chunks_per_file = atol(argv[2]); 31 | 32 | // process input row by row 33 | while (1) { 34 | 35 | load_next(&rbuf, &row, 0); 36 | if (row.stop) 37 | break; 38 | 39 | // open and print next file if needed 40 | if (f == NULL) { 41 | memset(filename, 0, sizeof(filename)); 42 | SNPRINTF(filename, sizeof(filename), "%s_%010d", prefix, i++); 43 | FOPEN(f, filename, "wb"); 44 | FPRINTF(stdout, "%s\n", filename); 45 | } 46 | 47 | // write chunk 48 | FWRITE(&rbuf.chunk_size[0], sizeof(i32), f); 49 | FWRITE(rbuf.buffers[0], rbuf.chunk_size[0], f); 50 | read_goto_next_chunk(&rbuf, &rabuf, 0); 51 | 52 | // close file if needed 53 | if (++j % chunks_per_file == 0) { 54 | ASSERT(fclose(f) != EOF, "fatal: failed to close files\n"); 55 | f = NULL; 56 | } 57 | 58 | } 59 | 60 | // close last file if needed 61 | if (f) 62 | ASSERT(fclose(f) != EOF, "fatal: failed to close files\n"); 63 | 64 | } 65 | -------------------------------------------------------------------------------- /src/bsum.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "load.h" 3 | #include "dump.h" 4 | 5 | #define DESCRIPTION "sum the first column\n\n" 6 | #define USAGE "... 
| bsum TYPE \n\n" 7 | #define EXAMPLE ">> echo '\n1\n2\n3\n4\n' | bsv | bschema a:i64 | bsum i64 | bschema i64:a | csv\n10\n" 8 | 9 | int main(int argc, char **argv) { 10 | 11 | // setup bsv 12 | SETUP(); 13 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 14 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 15 | 16 | // setup state 17 | i64 sum_i64 = 0; 18 | i32 sum_i32 = 0; 19 | i16 sum_i16 = 0; 20 | u64 sum_u64 = 0; 21 | u32 sum_u32 = 0; 22 | u16 sum_u16 = 0; 23 | f64 sum_f64 = 0; 24 | f32 sum_f32 = 0; 25 | i32 value_type; 26 | row_t row; 27 | 28 | // parse args 29 | ASSERT(argc == 2, "usage: %s", USAGE); 30 | if (strcmp(argv[1], "i64") == 0) value_type = I64; 31 | else if (strcmp(argv[1], "i32") == 0) value_type = I32; 32 | else if (strcmp(argv[1], "i16") == 0) value_type = I16; 33 | else if (strcmp(argv[1], "u64") == 0) value_type = U64; 34 | else if (strcmp(argv[1], "u32") == 0) value_type = U32; 35 | else if (strcmp(argv[1], "u16") == 0) value_type = U16; 36 | else if (strcmp(argv[1], "f64") == 0) value_type = F64; 37 | else if (strcmp(argv[1], "f32") == 0) value_type = F32; 38 | else ASSERT(0, "fatal: bad type %s\n", argv[1]); 39 | 40 | // process input row by row 41 | while (1) { 42 | load_next(&rbuf, &row, 0); 43 | if (row.stop) 44 | break; 45 | ASSERT_SIZE(value_type, row.sizes[0]); 46 | switch (value_type) { 47 | case I64: sum_i64 += *(i64*)(row.columns[0]); break; 48 | case I32: sum_i32 += *(i32*)(row.columns[0]); break; 49 | case I16: sum_i16 += *(i16*)(row.columns[0]); break; 50 | case U64: sum_u64 += *(u64*)(row.columns[0]); break; 51 | case U32: sum_u32 += *(u32*)(row.columns[0]); break; 52 | case U16: sum_u16 += *(u16*)(row.columns[0]); break; 53 | case F64: sum_f64 += *(f64*)(row.columns[0]); break; 54 | case F32: sum_f32 += *(f32*)(row.columns[0]); break; 55 | } 56 | } 57 | 58 | // output sum 59 | row.max = 0; 60 | switch (value_type) { 61 | case I64: row.columns[0] = &sum_i64; row.sizes[0] = sizeof(i64); break; 62 | case I32: row.columns[0] = &sum_i32; row.sizes[0] = sizeof(i32); break; 63 | case I16: row.columns[0] = &sum_i16; row.sizes[0] = sizeof(i16); break; 64 | case U64: row.columns[0] = &sum_u64; row.sizes[0] = sizeof(u64); break; 65 | case U32: row.columns[0] = &sum_u32; row.sizes[0] = sizeof(u32); break; 66 | case U16: row.columns[0] = &sum_u16; row.sizes[0] = sizeof(u16); break; 67 | case F64: row.columns[0] = &sum_f64; row.sizes[0] = sizeof(f64); break; 68 | case F32: row.columns[0] = &sum_f32; row.sizes[0] = sizeof(f32); break; 69 | } 70 | dump(&wbuf, &row, 0); 71 | dump_flush(&wbuf, 0); 72 | } 73 | -------------------------------------------------------------------------------- /src/bsv.c: -------------------------------------------------------------------------------- 1 | #include "csv.h" 2 | #include "dump.h" 3 | 4 | #define DESCRIPTION "convert csv to bsv\n\n" 5 | #define USAGE "... 
| bsv\n\n" 6 | #define EXAMPLE ">> echo a,b,c | bsv | bcut 3,2,1 | csv\nc,b,a\n" 7 | 8 | int main(int argc, char **argv) { 9 | 10 | // setup bsv 11 | SETUP(); 12 | 13 | // setup input 14 | CSV_INIT(); 15 | 16 | // setup output 17 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 18 | 19 | // setup state 20 | row_t row; 21 | 22 | // process input row by row 23 | while (1) { 24 | CSV_READ_LINE(stdin); 25 | if (csv_stop) 26 | break; 27 | if (csv_max > 0 || csv_sizes[0] > 0) { 28 | row.max = csv_max; 29 | for (i32 i = 0; i <= row.max; i++) { 30 | row.columns[i] = csv_columns[i]; 31 | row.sizes[i] = csv_sizes[i]; 32 | } 33 | dump(&wbuf, &row, 0); 34 | } 35 | } 36 | dump_flush(&wbuf, 0); 37 | } 38 | -------------------------------------------------------------------------------- /src/btake.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "load.h" 3 | #include "dump.h" 4 | 5 | #define DESCRIPTION "take while the first column is VALUE\n\n" 6 | #define USAGE "... | btake VALUE\n\n" 7 | #define EXAMPLE ">> echo '\na\nb\nc\nd\n' | bsv | bdropntil c | btake c | csv\nc\n\n" 8 | 9 | int main(int argc, char **argv) { 10 | 11 | // setup bsv 12 | SETUP(); 13 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 14 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 15 | 16 | // setup state 17 | row_t row; 18 | u8 *val = argv[1]; 19 | 20 | // process input row by row 21 | while (1) { 22 | load_next(&rbuf, &row, 0); 23 | if (row.stop) 24 | break; 25 | if (compare_str(row.columns[0], val) != 0) 26 | break; 27 | dump(&wbuf, &row, 0); 28 | } 29 | dump_flush(&wbuf, 0); 30 | } 31 | -------------------------------------------------------------------------------- /src/btakeuntil.c: -------------------------------------------------------------------------------- 1 | #include "read_ahead.h" 2 | #include "util.h" 3 | #include "load.h" 4 | #include "dump.h" 5 | 6 | #define DESCRIPTION "for sorted input, take until the first column is gte to VALUE\n\n" 7 | #define USAGE "... 
| btakeuntil VALUE [TYPE]\n\n" 8 | #define EXAMPLE ">> echo '\na\nb\nc\nd\n' | bsv | btakeuntil c | csv\na\nb\n\n" 9 | 10 | int main(int argc, char **argv) { 11 | 12 | // setup bsv 13 | SETUP(); 14 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 15 | readaheadbuf_t rabuf = rabuf_init(1); 16 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 17 | 18 | // setup state 19 | bool done_skipping = false; 20 | bool matched = false; 21 | i32 cmp; 22 | row_t row; 23 | ASSERT(argc >= 2, "usage: %s", USAGE); 24 | i64 val_i64; 25 | i32 val_i32; 26 | i16 val_i16; 27 | u64 val_u64; 28 | u32 val_u32; 29 | u16 val_u16; 30 | f64 val_f64; 31 | f32 val_f32; 32 | void *val; 33 | i32 value_type; 34 | if (argc == 2) { 35 | val = argv[1]; 36 | value_type = STR; 37 | } else { 38 | ASSERT(argc == 3, "usage: %s", USAGE); 39 | if (strcmp(argv[2], "i64") == 0) { value_type = I64; val_i64 = atol(argv[1]); val = &val_i64; } 40 | else if (strcmp(argv[2], "i32") == 0) { value_type = I32; val_i32 = atol(argv[1]); val = &val_i32; } 41 | else if (strcmp(argv[2], "i16") == 0) { value_type = I16; val_i16 = atol(argv[1]); val = &val_i16; } 42 | else if (strcmp(argv[2], "u64") == 0) { value_type = U64; val_u64 = atol(argv[1]); val = &val_u64; } 43 | else if (strcmp(argv[2], "u32") == 0) { value_type = U32; val_u32 = atol(argv[1]); val = &val_u32; } 44 | else if (strcmp(argv[2], "u16") == 0) { value_type = U16; val_u16 = atol(argv[1]); val = &val_u16; } 45 | else if (strcmp(argv[2], "f64") == 0) { value_type = F64; val_f64 = atof(argv[1]); val = &val_f64; } 46 | else if (strcmp(argv[2], "f32") == 0) { value_type = F32; val_f32 = atof(argv[1]); val = &val_f32; } 47 | else ASSERT(0, "fatal: bad type %s\n", argv[2]); 48 | } 49 | 50 | // process input row by row 51 | while (1) { 52 | load_next(&rbuf, &row, 0); 53 | if (row.stop) { // ----------------------------------------------- reached the last chunk and possibly need to back up to the previous chunk to find a match 54 | if (done_skipping) { // -------------------------------------- already gone back to the previous chunk, time to stop 55 | break; 56 | } else { // -------------------------------------------------- go back and check the entire last chunk for a match 57 | read_goto_last_chunk(&rbuf, &rabuf, 0); 58 | done_skipping = true; 59 | } 60 | } else { // ------------------------------------------------------ reading data chunk by chunk, checking the first row and then proceeding to the next chunk 61 | ASSERT_SIZE(value_type, row.sizes[0]); 62 | cmp = compare(value_type, row.columns[0], val); // ------------------- check for a match 63 | if (done_skipping) { // -------------------------------------- since we are done skipping ahead by chunks, check every row for a match 64 | if (cmp >= 0) // ----------------------------------------- found a match, time to stop 65 | break; 66 | dump(&wbuf, &row, 0); // --------------------------------- otherwise dump the row 67 | } else if (cmp < 0) { // ------------------------------------- we aren't done skipping ahead, we want to keep skipping until we've gone too far 68 | if (rabuf.has_nexted) { // ------------------------------- write the entire last chunk since we know all of its rows are not a match 69 | memcpy(wbuf.buffer[0], rabuf.last_buffers[0], rabuf.last_chunk_size[0]); 70 | wbuf.offset[0] = rabuf.last_chunk_size[0]; 71 | write_flush(&wbuf, 0); 72 | } 73 | read_goto_next_chunk(&rbuf, &rabuf, 0); 74 | } else { // -------------------------------------------------- we've gone too far, time to back up one chunk and 
start checking every row 75 | read_goto_last_chunk(&rbuf, &rabuf, 0); 76 | done_skipping = true; 77 | } 78 | } 79 | } 80 | dump_flush(&wbuf, 0); 81 | } 82 | -------------------------------------------------------------------------------- /src/btopn.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "argh.h" 3 | #include "array.h" 4 | #include "load.h" 5 | #include "dump.h" 6 | 7 | #define HEAP_COMPARE(meta, x, y) compare(meta, ((raw_row_t*)x)->buffer, ((raw_row_t*)y)->buffer) > 0 8 | #include "heap.h" 9 | 10 | #define DESCRIPTION "accumulate the top n rows in a heap by first column value\n\n" 11 | #define USAGE "... | btopn N [TYPE] [-r|--reversed]\n\n" 12 | #define EXAMPLE ">> echo '\n1\n3\n2\n' | bsv | bschema a:i64 | btopn 2 i64 | bschema i64:a | csv\n3\n2\n\n" 13 | 14 | int main(int argc, char **argv) { 15 | 16 | // setup bsv 17 | SETUP(); 18 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 19 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 20 | 21 | // parse args 22 | bool reversed = false; 23 | ARGH_PARSE { 24 | ARGH_NEXT(); 25 | if ARGH_BOOL("-r", "--reversed") { reversed = true; } 26 | } 27 | ASSERT(ARGH_ARGC >= 1, "usage: %s", USAGE); 28 | ASSERT(isdigits(ARGH_ARGV[0]), "usage: %s", USAGE); 29 | i32 top_n = atol(ARGH_ARGV[0]); 30 | i32 value_type; 31 | if (ARGH_ARGC == 1) 32 | if (reversed) 33 | value_type = R_STR; 34 | else 35 | value_type = STR; 36 | else { 37 | ASSERT(ARGH_ARGC == 2, "usage: %s", USAGE); 38 | if (reversed) { 39 | if (strcmp(ARGH_ARGV[1], "i64") == 0) value_type = R_I64; 40 | else if (strcmp(ARGH_ARGV[1], "i32") == 0) value_type = R_I32; 41 | else if (strcmp(ARGH_ARGV[1], "i16") == 0) value_type = R_I16; 42 | else if (strcmp(ARGH_ARGV[1], "u64") == 0) value_type = R_U64; 43 | else if (strcmp(ARGH_ARGV[1], "u32") == 0) value_type = R_U32; 44 | else if (strcmp(ARGH_ARGV[1], "u16") == 0) value_type = R_U16; 45 | else if (strcmp(ARGH_ARGV[1], "f64") == 0) value_type = R_F64; 46 | else if (strcmp(ARGH_ARGV[1], "f32") == 0) value_type = R_F32; 47 | else ASSERT(0, "fatal: bad type %s\n", ARGH_ARGV[1]); 48 | } else { 49 | if (strcmp(ARGH_ARGV[1], "i64") == 0) value_type = I64; 50 | else if (strcmp(ARGH_ARGV[1], "i32") == 0) value_type = I32; 51 | else if (strcmp(ARGH_ARGV[1], "i16") == 0) value_type = I16; 52 | else if (strcmp(ARGH_ARGV[1], "u64") == 0) value_type = U64; 53 | else if (strcmp(ARGH_ARGV[1], "u32") == 0) value_type = U32; 54 | else if (strcmp(ARGH_ARGV[1], "u16") == 0) value_type = U16; 55 | else if (strcmp(ARGH_ARGV[1], "f64") == 0) value_type = F64; 56 | else if (strcmp(ARGH_ARGV[1], "f32") == 0) value_type = F32; 57 | else ASSERT(0, "fatal: bad type %s\n", ARGH_ARGV[1]); 58 | } 59 | } 60 | 61 | // setup state 62 | row_t row; 63 | raw_row_t *raw_row; 64 | heap_t h = {0}; 65 | h.meta = value_type; 66 | 67 | // process input row by row 68 | while (1) { 69 | load_next(&rbuf, &row, 0); 70 | if (row.stop) 71 | break; 72 | ASSERT_SIZE(value_type, row.sizes[0]); 73 | MALLOC(raw_row, sizeof(raw_row_t)); 74 | row_to_raw_malloc(&row, raw_row); 75 | heap_insert(&h, raw_row); 76 | if (h.size > top_n * 128) // amortize truncation cost, 128 is arbitrary 77 | heap_truncate(&h, top_n); 78 | } 79 | 80 | // dump output 81 | i32 i = top_n; 82 | while (i--) { 83 | if (!h.size) 84 | break; 85 | raw_row = (raw_row_t*)h.nodes[0]; 86 | dump_raw(&wbuf, raw_row, 0); 87 | heap_delete(&h); 88 | } 89 | dump_flush(&wbuf, 0); 90 | 91 | } 92 | 
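a reversed usage sketch for btopn (keeps the n smallest; output order assumed ascending, mirroring the forward example above):
>> echo '\n1\n3\n2\n' | bsv | bschema a:i64 | btopn 2 i64 -r | bschema i64:a | csv
1
2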
-------------------------------------------------------------------------------- /src/bunzip.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "argh.h" 3 | #include "load.h" 4 | #include "dump.h" 5 | 6 | #define DESCRIPTION "split a multi column input into single column outputs\n\n" 7 | #define USAGE "... | bunzip PREFIX [-l|--lz4]\n\n" 8 | #define EXAMPLE ">> echo '\na,b,c\n1,2,3\n' | bsv | bunzip col && echo col_1 col_3 | bzip | csv\na,c\n1,3\n" 9 | 10 | int main(int argc, char **argv) { 11 | 12 | // setup bsv 13 | SETUP(); 14 | 15 | // setup input 16 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 17 | 18 | // setup state 19 | u8 num_columns_str[16]; 20 | u8 path[1024]; 21 | u8 *prefix; 22 | row_t row; 23 | row_t new; 24 | new.max = 0; 25 | 26 | // parse args 27 | bool lz4 = false; 28 | ARGH_PARSE { 29 | ARGH_NEXT(); 30 | if ARGH_BOOL("-l", "--lz4") { lz4 = true; } 31 | } 32 | ASSERT(ARGH_ARGC == 1, "usage: %s", USAGE); 33 | prefix = ARGH_ARGV[0]; 34 | 35 | // read first row to find the number of columns 36 | load_next(&rbuf, &row, 0); 37 | if (row.stop) 38 | exit(0); 39 | i32 unzip_max = row.max; 40 | 41 | // open output files 42 | FILE *files[unzip_max + 1]; 43 | SNPRINTF(num_columns_str, sizeof(num_columns_str), "%d", unzip_max + 1); 44 | for (i32 i = 0; i <= unzip_max; i++) { 45 | SNPRINTF(path, sizeof(path), "%s_%0*d", prefix, strlen(num_columns_str), i + 1); 46 | FOPEN(files[i], path, "wb"); 47 | } 48 | 49 | // setup output 50 | writebuf_t wbuf = wbuf_init(files, unzip_max + 1, lz4); 51 | 52 | // output first row 53 | for (i32 i = 0; i <= unzip_max; i++) { 54 | new.sizes[0] = row.sizes[i]; 55 | new.columns[0] = row.columns[i]; 56 | dump(&wbuf, &new, i); 57 | } 58 | 59 | // load the next row in case we need to stop 60 | load_next(&rbuf, &row, 0); 61 | 62 | // process the rest of input row by row 63 | while (!row.stop) { 64 | ASSERT(row.max == unzip_max, "fatal: unzip found a bad row, needed max %d, got: %d\n", unzip_max, row.max); 65 | for (i32 i = 0; i <= unzip_max; i++) { 66 | new.sizes[0] = row.sizes[i]; 67 | new.columns[0] = row.columns[i]; 68 | dump(&wbuf, &new, i); 69 | } 70 | load_next(&rbuf, &row, 0); 71 | } 72 | 73 | // flush and close 74 | for (i32 i = 0; i <= unzip_max; i++) { 75 | dump_flush(&wbuf, i); 76 | ASSERT(fclose(files[i]) != EOF, "fatal: failed to close files\n"); 77 | SNPRINTF(path, sizeof(path), "%s_%0*d", prefix, strlen(num_columns_str), i + 1); 78 | FPRINTF(stdout, "%s\n", path); 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /src/bzip.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "argh.h" 3 | #include "load.h" 4 | #include "array.h" 5 | #include "dump.h" 6 | 7 | #define DESCRIPTION "combine single column inputs into a multi column output\n\n" 8 | #define USAGE "ls column_* | bzip [COL1,...COLN] [-l|--lz4]\n\n" 9 | #define EXAMPLE ">> echo '\na,b,c\n1,2,3\n' | bsv | bunzip column && ls column_* | bzip 1,3 | csv\na,c\n1,3\n" 10 | 11 | int main(int argc, char **argv) { 12 | 13 | // setup bsv 14 | SETUP(); 15 | 16 | // parse args 17 | bool lz4 = false; 18 | ARGH_PARSE { 19 | ARGH_NEXT(); 20 | if ARGH_BOOL("-l", "--lz4") { lz4 = true; } 21 | } 22 | 23 | // setup input, filenames come in on stdin 24 | ARRAY_INIT(files, FILE*); 25 | ARRAY_INIT(filename, u8); 26 | u8 tmp; 27 | FILE* file; 28 | i32 size; 29 | while (1) { 30 | size = fread_unlocked(&tmp, 1, 1, 
stdin); 31 | if (size != 1) 32 | break; 33 | if (tmp == '\n' || tmp == ' ') { 34 | if (ARRAY_SIZE(filename) > 0) { 35 | ARRAY_APPEND(filename, '\0', u8); 36 | FOPEN(file, filename, "rb"); 37 | ARRAY_APPEND(files, file, FILE*); 38 | ARRAY_RESET(filename); 39 | } 40 | } else { 41 | ARRAY_APPEND(filename, tmp, u8); 42 | } 43 | } 44 | readbuf_t rbuf = rbuf_init(files, ARRAY_SIZE(files), lz4); 45 | 46 | // parse selection 47 | ARRAY_INIT(selected, i32); 48 | u8 *f; 49 | i32 column; 50 | switch (ARGH_ARGC) { 51 | // default is all columns 52 | case 0: 53 | for (i32 i = 0; i < ARRAY_SIZE(files); i++) 54 | ARRAY_APPEND(selected, i, i32); 55 | break; 56 | // otherwise choose columns 57 | case 1: 58 | while ((f = strsep(&ARGH_ARGV[0], ","))) { 59 | column = atoi(f); 60 | ASSERT(column > 0, "fatal: bad column selection, should be like: '1,2,3' and cannot select below column 1.\n"); 61 | ASSERT(column <= ARRAY_SIZE(files), "fatal: bad column selection, should be like: '1,2,3' and cannot select above column %d.\n", ARRAY_SIZE(files)); 62 | ARRAY_APPEND(selected, column - 1, i32); 63 | } 64 | i32 used[ARRAY_SIZE(files)]; 65 | for (i32 i = 0; i < ARRAY_SIZE(files); i++) 66 | used[i] = -1; 67 | for (i32 i = 0; i < ARRAY_SIZE(selected); i++) { 68 | ASSERT(used[selected[i]] == -1, "fatal: can only select columns once, got dupe for column: %d\n", selected[i] + 1); 69 | used[selected[i]] = 1; 70 | } 71 | break; 72 | } 73 | 74 | // setup state 75 | row_t row; 76 | row_t new; 77 | new.max = ARRAY_SIZE(selected) - 1; 78 | i32 stops[ARRAY_SIZE(selected)]; 79 | i32 do_stop[ARRAY_SIZE(selected)]; 80 | i32 dont_stop[ARRAY_SIZE(selected)]; 81 | for (i32 i = 0; i < ARRAY_SIZE(selected); i++) { 82 | do_stop[i] = 1; 83 | dont_stop[i] = 0; 84 | } 85 | 86 | // setup output 87 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1, false); 88 | 89 | // process input row by row 90 | while (1) { 91 | for (i32 i = 0; i < ARRAY_SIZE(selected); i++) { 92 | load_next(&rbuf, &row, selected[i]); 93 | ASSERT(row.max == 0, "fatal: tried to zip a row with more than 1 column\n"); 94 | new.sizes[i] = row.sizes[0]; 95 | new.columns[i] = row.columns[0]; 96 | stops[i] = row.stop; 97 | } 98 | if (memcmp(stops, dont_stop, ARRAY_SIZE(selected) * sizeof(i32)) != 0) { 99 | ASSERT(memcmp(stops, do_stop, ARRAY_SIZE(selected) * sizeof(i32)) == 0, "fatal: all columns didn't end at the same length\n"); 100 | break; 101 | } 102 | dump(&wbuf, &new, 0); 103 | } 104 | dump_flush(&wbuf, 0); 105 | 106 | } 107 | -------------------------------------------------------------------------------- /src/csv.c: -------------------------------------------------------------------------------- 1 | #include "load.h" 2 | #include "write_simple.h" 3 | 4 | #define DESCRIPTION "convert bsv to csv\n\n" 5 | #define USAGE "... 
| csv\n\n" 6 | #define EXAMPLE ">> echo a,b,c | bsv | csv\na,b,c\n" 7 | 8 | int main(int argc, char **argv) { 9 | 10 | // setup bsv 11 | SETUP(); 12 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1, false); 13 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1); 14 | 15 | // setup state 16 | row_t row; 17 | i32 ran = 0; 18 | 19 | // process input row by row 20 | while (1) { 21 | load_next(&rbuf, &row, 0); 22 | if (row.stop) 23 | break; 24 | for (i32 i = 0; i <= row.max; i++) { 25 | write_bytes(&wbuf, row.columns[i], row.sizes[i], 0); 26 | if (i != row.max) 27 | write_bytes(&wbuf, ",", 1, 0); 28 | } 29 | write_bytes(&wbuf, "\n", 1, 0); 30 | ran = 1; 31 | } 32 | if (ran == 0) 33 | write_bytes(&wbuf, "\n", 1, 0); 34 | write_flush(&wbuf, 0); 35 | } 36 | -------------------------------------------------------------------------------- /src/xxh3.c: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | #include "read_simple.h" 3 | #include "write_simple.h" 4 | #include "argh.h" 5 | #include "xxh3.h" 6 | 7 | #define DESCRIPTION "xxh3_64 hash stdin\n\n" 8 | #define USAGE "... | xxh3 [--stream|--int]\n\n" 9 | #define EXAMPLE \ 10 | " --stream pass stdin through to stdout with hash on stderr\n\n" \ 11 | " --int output hash as int not hash\n\n" \ 12 | ">> echo abc | xxh3\n079364cbfdf9f4cb\n" 13 | 14 | int main(int argc, char **argv) { 15 | 16 | // setup bsv 17 | SETUP(); 18 | readbuf_t rbuf = rbuf_init((FILE*[]){stdin}, 1); 19 | writebuf_t wbuf = wbuf_init((FILE*[]){stdout}, 1); 20 | 21 | // parse args 22 | bool int_out = false; 23 | bool stream = false; 24 | ARGH_PARSE { 25 | ARGH_NEXT(); 26 | if ARGH_BOOL("-i", "--int") { int_out = true; } 27 | else if ARGH_BOOL("-s", "--stream") { stream = true; } 28 | } 29 | 30 | // setup state 31 | XXH3_state_t state; 32 | ASSERT(XXH3_64bits_reset(&state) != XXH_ERROR, "xxh3 reset failed\n"); 33 | 34 | // process input row by row 35 | while (1) { 36 | read_bytes(&rbuf, BUFFER_SIZE, 0); 37 | ASSERT(XXH3_64bits_update(&state, rbuf.buffer, rbuf.bytes) != XXH_ERROR, "xxh3 update failed\n"); 38 | if (stream) 39 | write_bytes(&wbuf, rbuf.buffer, rbuf.bytes, 0); 40 | if (BUFFER_SIZE != rbuf.bytes) 41 | break; 42 | } 43 | 44 | // 45 | u64 hash = XXH3_64bits_digest(&state); 46 | FILE *out = (stream) ? 
stderr : stdout; 47 | if (int_out) 48 | FPRINTF(out, "%lu\n", hash); 49 | else 50 | FPRINTF(out, "%08x%08x\n", (i32)(hash>>32), (i32)hash); 51 | if (stream) 52 | write_flush(&wbuf, 0); 53 | } 54 | -------------------------------------------------------------------------------- /test/_queue_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shell 3 | import queue 4 | from hypothesis import given, settings 5 | from hypothesis.strategies import composite, integers, sampled_from, randoms 6 | from test_util import run, clone_source 7 | 8 | def setup_module(m): 9 | m.tempdir = clone_source() 10 | m.orig = os.getcwd() 11 | m.path = os.environ['PATH'] 12 | os.chdir(m.tempdir) 13 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 14 | shell.run('make clean', stream=True) 15 | shell.run('make _queue') 16 | 17 | def teardown_module(): 18 | with shell.climb_git_root(): 19 | shell.run('make clean', stream=True) 20 | 21 | @composite 22 | def inputs(draw): 23 | capacity = draw(integers(min_value=1, max_value=16)) 24 | num_actions = draw(integers(min_value=0, max_value=256)) 25 | rand = draw(randoms()) 26 | actions = [] 27 | for _ in range(num_actions): 28 | possible_actions = [ 29 | f'put {rand.randint(0, 999)}', 30 | 'get', 31 | ] 32 | actions.append(draw(sampled_from(possible_actions))) 33 | return capacity, actions 34 | 35 | def expected(arg): 36 | capacity, actions = arg 37 | res = [] 38 | q = queue.Queue(capacity) 39 | for action in actions: 40 | if action == 'get': 41 | try: 42 | res.append(q.get_nowait()) 43 | except queue.Empty: 44 | res.append('empty') 45 | elif action.split()[0] == 'put': 46 | try: 47 | q.put_nowait(action.split()[1]) 48 | except queue.Full: 49 | res.append('full') 50 | else: 51 | assert False, action 52 | return '\n'.join(map(str, res)) 53 | 54 | @given(inputs()) 55 | @settings(max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) 56 | def test_props(arg): 57 | capacity, actions = arg 58 | result = expected(arg) 59 | assert result == run('\n'.join(actions) + '\n', '_queue', capacity).strip() 60 | -------------------------------------------------------------------------------- /test/bcat_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shell 3 | from test_util import unindent, rm_whitespace, clone_source 4 | 5 | def setup_module(m): 6 | m.tempdir = clone_source() 7 | m.orig = os.getcwd() 8 | m.path = os.environ['PATH'] 9 | os.chdir(m.tempdir) 10 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 11 | shell.run('make clean && make bsv csv bcat', stream=True) 12 | 13 | def teardown_module(m): 14 | os.chdir(m.orig) 15 | os.environ['PATH'] = m.path 16 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 17 | shell.run('rm -rf', m.tempdir) 18 | 19 | def test_basic(): 20 | with shell.tempdir(): 21 | shell.run('for char in a a b b c c; do echo $char | bsv >> $char; done') 22 | stdout = """ 23 | a:a 24 | b:b 25 | c:c 26 | """ 27 | assert rm_whitespace(unindent(stdout)) == shell.run('bcat --prefix --head 1 a b c') 28 | stdout = """ 29 | a:a 30 | a:a 31 | b:b 32 | b:b 33 | c:c 34 | c:c 35 | """ 36 | assert rm_whitespace(unindent(stdout)) == shell.run('bcat --prefix --head 2 a b c') 37 | assert rm_whitespace(unindent(stdout)) == shell.run('bcat --head 2 --prefix a b c') 38 | assert 
rm_whitespace(unindent(stdout)) == shell.run('bcat --prefix a b c') 39 | stdout = """ 40 | a 41 | b 42 | c 43 | """ 44 | assert rm_whitespace(unindent(stdout)) == shell.run('bcat --head 1 a b c') 45 | stdout = """ 46 | a 47 | a 48 | b 49 | b 50 | c 51 | c 52 | """ 53 | assert rm_whitespace(unindent(stdout)) == shell.run('bcat a b c') 54 | -------------------------------------------------------------------------------- /test/bcombine_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import shell 4 | from test_util import run, rm_whitespace, clone_source 5 | 6 | def setup_module(m): 7 | m.tempdir = clone_source() 8 | m.orig = os.getcwd() 9 | m.path = os.environ['PATH'] 10 | os.chdir(m.tempdir) 11 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 12 | shell.run('make clean && make bsv csv bcombine', stream=True) 13 | 14 | def teardown_module(m): 15 | os.chdir(m.orig) 16 | os.environ['PATH'] = m.path 17 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 18 | shell.run('rm -rf', m.tempdir) 19 | 20 | def test_basic1(): 21 | stdin = """ 22 | a,b,c,d 23 | 1,2,3 24 | x,y 25 | """ 26 | stdout = """ 27 | a:b,a,b,c,d 28 | 1:2,1,2,3 29 | x:y,x,y 30 | """ 31 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bcombine 1,2 | csv') 32 | 33 | def test_basic2(): 34 | stdin = """ 35 | a,b,c,d 36 | 1,2,3 37 | x,y 38 | """ 39 | stdout = """ 40 | b:a,a,b,c,d 41 | 2:1,1,2,3 42 | y:x,x,y 43 | """ 44 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bcombine 2,1 | csv') 45 | 46 | def test_basic3(): 47 | stdin = """ 48 | a,b,c,d 49 | 1,2,3 50 | x 51 | """ 52 | with pytest.raises(Exception): 53 | run(rm_whitespace(stdin), 'bsv | bcombine 2,1 | csv') 54 | -------------------------------------------------------------------------------- /test/bcounteach_hash_test.py: -------------------------------------------------------------------------------- 1 | import random 2 | import os 3 | import collections 4 | import string 5 | import shell 6 | from hypothesis.database import ExampleDatabase 7 | from hypothesis import given, settings, HealthCheck 8 | from hypothesis.strategies import text, lists, composite, integers, randoms, sampled_from 9 | from test_util import run, rm_whitespace, clone_source, compile_buffer_sizes 10 | 11 | if os.environ.get('TEST_FACTOR'): 12 | buffers = list(sorted(set([128, 256, 1024, 1024 * 1024 * 5] + [random.randint(128, 1024) for _ in range(10)]))) 13 | else: 14 | buffers = [128] 15 | 16 | def setup_module(m): 17 | m.tempdir = clone_source() 18 | m.orig = os.getcwd() 19 | m.path = os.environ['PATH'] 20 | os.chdir(m.tempdir) 21 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 22 | shell.run('make clean', stream=True) 23 | compile_buffer_sizes('csv', buffers) 24 | compile_buffer_sizes('bsv', buffers) 25 | compile_buffer_sizes('bsort', buffers) 26 | compile_buffer_sizes('bschema', buffers) 27 | compile_buffer_sizes('bcounteach-hash', buffers) 28 | shell.run('make bsv csv bsort bschema bcounteach-hash', stream=True) 29 | 30 | def teardown_module(m): 31 | os.chdir(m.orig) 32 | os.environ['PATH'] = m.path 33 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 34 | shell.run('rm -rf', m.tempdir) 35 | 36 | @composite 37 | def inputs(draw): 38 | buffer = draw(sampled_from(buffers)) 39 | random = 
draw(randoms()) 40 | num_columns = draw(integers(min_value=1, max_value=4)) 41 | max_repeats = draw(integers(min_value=1, max_value=3)) 42 | column = text(string.ascii_lowercase, min_size=1, max_size=20) 43 | line = lists(column, min_size=num_columns, max_size=num_columns) 44 | lines = draw(lists(line)) 45 | lines = [','.join(x) for x in lines] 46 | lines = [l 47 | for line in lines 48 | for l in [line] * (1 if random.random() > .5 else random.randint(1, max_repeats))] 49 | return buffer, '\n'.join(lines) + '\n' 50 | 51 | def expected(csv): 52 | lines = csv.splitlines() 53 | lines = [x.split(',')[0] for x in lines] 54 | counts = collections.Counter(lines) 55 | return '\n'.join(f'{k},{counts[k]}' for k in sorted(counts) if k) + '\n' 56 | 57 | @given(inputs()) 58 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), suppress_health_check=HealthCheck.all()) # type: ignore 59 | def test_props(args): 60 | buffer, csv = args 61 | result = expected(csv) 62 | assert result == run(csv, f'bsv.{buffer} | bcounteach-hash.{buffer} | bschema.{buffer} *,i64:a | bsort.{buffer} | csv.{buffer}') 63 | 64 | def test_basic(): 65 | stdin = """ 66 | a 67 | a 68 | a 69 | b 70 | b 71 | a 72 | """ 73 | stdout = """ 74 | a,4 75 | b,2 76 | """ 77 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bcounteach-hash | bschema *,i64:a | bsort | csv') 78 | -------------------------------------------------------------------------------- /test/bcounteach_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import shell 4 | from hypothesis.database import ExampleDatabase 5 | from hypothesis import given, settings, HealthCheck 6 | from hypothesis.strategies import text, lists, composite, integers, randoms 7 | from test_util import run, rm_whitespace, clone_source 8 | 9 | def setup_module(m): 10 | m.tempdir = clone_source() 11 | m.orig = os.getcwd() 12 | m.path = os.environ['PATH'] 13 | os.chdir(m.tempdir) 14 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 15 | shell.run('make clean && make bsv csv bschema bcounteach', stream=True) 16 | 17 | def teardown_module(m): 18 | os.chdir(m.orig) 19 | os.environ['PATH'] = m.path 20 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 21 | shell.run('rm -rf', m.tempdir) 22 | 23 | @composite 24 | def inputs(draw): 25 | random = draw(randoms()) 26 | num_columns = draw(integers(min_value=1, max_value=4)) 27 | max_repeats = draw(integers(min_value=1, max_value=3)) 28 | column = text(string.ascii_lowercase, min_size=1, max_size=64) 29 | line = lists(column, min_size=num_columns, max_size=num_columns) 30 | lines = draw(lists(line)) 31 | lines = [','.join(x) for x in lines] 32 | lines = [l 33 | for line in lines 34 | for l in [line] * (1 if random.random() > .5 else random.randint(1, max_repeats))] 35 | return '\n'.join(lines) + '\n' 36 | 37 | def expected(csv): 38 | lines = csv.splitlines() 39 | lines = [x.split(',')[0] for x in lines] 40 | result = [] 41 | count = 0 42 | last = None 43 | for line in lines: 44 | if line: 45 | if last is not None and last != line: 46 | result.append(f'{last},{count}') 47 | count = 0 48 | last = line 49 | count += 1 50 | if last: 51 | result.append(f'{last},{count}') 52 | return '\n'.join(result) + '\n' 53 | 54 | @given(inputs()) 55 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * 
int(os.environ.get('TEST_FACTOR', 1)), suppress_health_check=HealthCheck.all()) # type: ignore 56 | def test_props(args): 57 | csv = args 58 | result = expected(csv) 59 | assert result == run(csv, 'bsv | bcounteach | bschema *,i64:a | csv') 60 | 61 | def test_basic(): 62 | stdin = """ 63 | a 64 | a 65 | a 66 | b 67 | b 68 | a 69 | """ 70 | stdout = """ 71 | a,3 72 | b,2 73 | a,1 74 | """ 75 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bcounteach | bschema *,i64:a | csv') 76 | -------------------------------------------------------------------------------- /test/bcountrows_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shell 3 | from test_util import run, rm_whitespace, clone_source 4 | 5 | def setup_module(m): 6 | m.tempdir = clone_source() 7 | m.orig = os.getcwd() 8 | m.path = os.environ['PATH'] 9 | os.chdir(m.tempdir) 10 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 11 | shell.run('make clean && make bsv csv bschema bcountrows', stream=True) 12 | 13 | def teardown_module(m): 14 | os.chdir(m.orig) 15 | os.environ['PATH'] = m.path 16 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 17 | shell.run('rm -rf', m.tempdir) 18 | 19 | def test_basic(): 20 | stdin = """ 21 | a 22 | a 23 | a 24 | b 25 | b 26 | a 27 | """ 28 | stdout = """ 29 | 6 30 | """ 31 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bcountrows | bschema i64:a | csv') 32 | -------------------------------------------------------------------------------- /test/bdedupe_hash_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import shell 4 | from hypothesis.database import ExampleDatabase 5 | from hypothesis import given, settings 6 | from hypothesis.strategies import text, lists, composite, integers, randoms 7 | from test_util import run, rm_whitespace, clone_source 8 | 9 | def setup_module(m): 10 | m.tempdir = clone_source() 11 | m.orig = os.getcwd() 12 | m.path = os.environ['PATH'] 13 | os.chdir(m.tempdir) 14 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 15 | shell.run('make clean && make bsv csv bdedupe-hash', stream=True) 16 | 17 | def teardown_module(m): 18 | os.chdir(m.orig) 19 | os.environ['PATH'] = m.path 20 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 21 | shell.run('rm -rf', m.tempdir) 22 | 23 | @composite 24 | def inputs(draw): 25 | random = draw(randoms()) 26 | num_columns = draw(integers(min_value=1, max_value=8)) 27 | max_repeats = draw(integers(min_value=1, max_value=3)) 28 | column = text(string.ascii_lowercase, min_size=1, max_size=64) 29 | line = lists(column, min_size=num_columns, max_size=num_columns) 30 | lines = draw(lists(line)) 31 | lines = [','.join(x) for x in lines] 32 | lines = [l 33 | for line in lines 34 | for l in [line] * (1 if random.random() > .5 else random.randint(1, max_repeats))] 35 | return '\n'.join(lines) + '\n' 36 | 37 | def expected(csv): 38 | lines = csv.splitlines() 39 | seen = set() 40 | result = [] 41 | for line in lines: 42 | key = line.split(',')[0] 43 | if key not in seen: 44 | seen.add(key) 45 | result.append(line) 46 | return '\n'.join(result) + '\n' 47 | 48 | @given(inputs()) 49 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), 
deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 50 | def test_props(args): 51 | csv = args 52 | result = expected(csv) 53 | assert result == run(csv, 'bsv | bdedupe-hash | csv') 54 | 55 | def test_basic(): 56 | stdin = """ 57 | a,1 58 | a,2 59 | a,3 60 | b,4 61 | b,5 62 | a,6 63 | a,7 64 | """ 65 | stdout = """ 66 | a,1 67 | b,4 68 | """ 69 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bdedupe-hash | csv') 70 | -------------------------------------------------------------------------------- /test/bdedupe_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import shell 4 | from hypothesis.database import ExampleDatabase 5 | from hypothesis import given, settings 6 | from hypothesis.strategies import text, lists, composite, integers, randoms 7 | from test_util import run, rm_whitespace, clone_source 8 | 9 | def setup_module(m): 10 | m.tempdir = clone_source() 11 | m.orig = os.getcwd() 12 | m.path = os.environ['PATH'] 13 | os.chdir(m.tempdir) 14 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 15 | shell.run('make clean && make bsv csv bdedupe', stream=True) 16 | 17 | def teardown_module(m): 18 | os.chdir(m.orig) 19 | os.environ['PATH'] = m.path 20 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 21 | shell.run('rm -rf', m.tempdir) 22 | 23 | @composite 24 | def inputs(draw): 25 | random = draw(randoms()) 26 | num_columns = draw(integers(min_value=1, max_value=8)) 27 | max_repeats = draw(integers(min_value=1, max_value=3)) 28 | column = text(string.ascii_lowercase, min_size=1, max_size=64) 29 | line = lists(column, min_size=num_columns, max_size=num_columns) 30 | lines = draw(lists(line)) 31 | lines = [','.join(x) for x in lines] 32 | lines = [l 33 | for line in lines 34 | for l in [line] * (1 if random.random() > .5 else random.randint(1, max_repeats))] 35 | return '\n'.join(lines) + '\n' 36 | 37 | def expected(csv): 38 | lines = csv.splitlines() 39 | result = [] 40 | for line in lines: 41 | if not result or result[-1].split(',')[0] != line.split(',')[0]: 42 | result.append(line) 43 | return '\n'.join(result) + '\n' 44 | 45 | @given(inputs()) 46 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 47 | def test_props(args): 48 | csv = args 49 | result = expected(csv) 50 | assert result == run(csv, 'bsv | bdedupe | csv') 51 | 52 | def test_basic(): 53 | stdin = """ 54 | a,1 55 | a,2 56 | a,3 57 | b,4 58 | b,5 59 | a,6 60 | a,7 61 | """ 62 | stdout = """ 63 | a,1 64 | b,4 65 | a,6 66 | """ 67 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bdedupe | csv') 68 | -------------------------------------------------------------------------------- /test/bdropuntil_i64_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shell 3 | import string 4 | from hypothesis.database import ExampleDatabase 5 | from hypothesis import given, settings 6 | from hypothesis.strategies import lists, composite, integers, randoms, floats, text 7 | from test_util import run, clone_source 8 | 9 | def setup_module(m): 10 | m.tempdir = clone_source() 11 | m.orig = os.getcwd() 12 | m.path = os.environ['PATH'] 13 | os.chdir(m.tempdir) 14 | os.environ['PATH'] = 
f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 15 | shell.run('make clean && make bsv csv bschema bsort bdropuntil', stream=True) 16 | 17 | def teardown_module(m): 18 | os.chdir(m.orig) 19 | os.environ['PATH'] = m.path 20 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 21 | shell.run('rm -rf', m.tempdir) 22 | 23 | @composite 24 | def inputs(draw): 25 | r = draw(randoms()) 26 | num_columns = draw(integers(min_value=1, max_value=4)) 27 | column = integers(min_value=-9223372036854775806, max_value=9223372036854775806) 28 | line = lists(column, min_size=num_columns, max_size=num_columns) 29 | lines = draw(lists(line, min_size=1)) 30 | lines = [[str(x) for x in line] for line in lines] 31 | first_column_values = [line[0] for line in lines] 32 | threshold = draw(floats(min_value=0, max_value=1)) 33 | for line in lines: 34 | if line and r.random() > threshold: 35 | line[0] = r.choice(first_column_values) 36 | csv = '\n'.join([','.join(l) for l in lines if l]).strip() + '\n' 37 | value = r.choice(first_column_values) 38 | return value, csv 39 | 40 | def expected(value, csv): 41 | value = int(value) 42 | res = [] 43 | found = False 44 | lines = csv.splitlines() 45 | lines = [[int(x) for x in l.split(',')] for l in lines] 46 | lines = sorted(lines) 47 | for cols in lines: 48 | if found: 49 | res.append(cols[0]) 50 | else: 51 | if cols: 52 | if cols[0] >= value: 53 | res.append(cols[0]) 54 | found = True 55 | return '\n'.join(map(str, res)) + '\n' 56 | 57 | @given(inputs()) 58 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 59 | def test_props(args): 60 | value, csv = args 61 | result = expected(value, csv) 62 | assert result.splitlines() == run(csv, f'bsv | bschema a:i64,... 
| bsort i64 | bdropuntil "{value}" i64 | bschema i64:a | csv').splitlines() 63 | -------------------------------------------------------------------------------- /test/bdropuntil_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import shell 4 | import random 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import text, lists, composite, integers, randoms, floats, sampled_from 8 | from test_util import run, clone_source, compile_buffer_sizes 9 | 10 | if os.environ.get('TEST_FACTOR'): 11 | buffers = list(sorted(set([12, 17, 64, 256, 1024, 1024 * 1024 * 5] + [random.randint(8, 1024) for _ in range(10)]))) 12 | else: 13 | buffers = [128] 14 | 15 | def setup_module(m): 16 | m.tempdir = clone_source() 17 | m.orig = os.getcwd() 18 | m.path = os.environ['PATH'] 19 | os.chdir(m.tempdir) 20 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 21 | shell.run('make clean', stream=True) 22 | compile_buffer_sizes('csv', buffers) 23 | compile_buffer_sizes('bsv', buffers) 24 | compile_buffer_sizes('bsort', buffers) 25 | compile_buffer_sizes('bdropuntil', buffers) 26 | shell.run('make bsv csv bsort bdropuntil', stream=True) 27 | 28 | def teardown_module(m): 29 | os.chdir(m.orig) 30 | os.environ['PATH'] = m.path 31 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 32 | shell.run('rm -rf', m.tempdir) 33 | 34 | def partition(r, n, x): 35 | res = [] 36 | ks = list(sorted({r.randint(1, max(1, len(x))) for _ in range(n)})) 37 | ks = [0] + ks 38 | ks[-1] = len(x) 39 | for a, b in zip(ks, ks[1:]): 40 | res.append(x[a:b]) 41 | return res 42 | 43 | @composite 44 | def inputs(draw): 45 | r = draw(randoms()) 46 | buffer = draw(sampled_from(buffers)) 47 | num_text_columns = draw(integers(min_value=1, max_value=4)) 48 | text_column = text(string.ascii_lowercase, min_size=1, max_size=8) 49 | text_line = lists(text_column, min_size=num_text_columns, max_size=num_text_columns) 50 | lines = draw(lists(text_line, min_size=1)) 51 | first_column_values = [line[0] for line in lines] 52 | threshold = draw(floats(min_value=0, max_value=1)) 53 | for line in lines: 54 | if line and r.random() > threshold: 55 | line[0] = r.choice(first_column_values) 56 | csv = '\n'.join([','.join(l)[:buffer // 4] for l in lines if l]).strip() + '\n' 57 | value = r.choice(first_column_values) 58 | return value, csv, buffer 59 | 60 | def expected(value, csv): 61 | res = [] 62 | found = False 63 | lines = csv.splitlines() 64 | lines = [l.split(',') for l in lines] 65 | lines = sorted(lines, key=lambda x: x[0]) 66 | for cols in lines: 67 | line = ','.join(str(c) for c in cols) 68 | if found: 69 | res.append(line) 70 | else: 71 | if cols: 72 | if cols[0] >= value: 73 | res.append(line) 74 | found = True 75 | return '\n'.join(res) + '\n' 76 | 77 | @given(inputs()) 78 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 79 | def test_props(args): 80 | value, csv, buffer = args 81 | result = expected(value, csv) 82 | assert set(result.splitlines()) == set(run(csv, f'bsv.{buffer} | bsort.{buffer} | bdropuntil.{buffer} "{value}" | csv.{buffer}').splitlines()) # set because sort is not stable and is only for first column values 83 | 84 | def test_example1(): 85 | value, csv = 'g', 'a\nb\nc\nd\ne\nf\ng\nh\n' 86 | 
result = expected(value, csv) 87 | assert result == run(csv, f'bsv 2>/dev/null | bsort | bdropuntil "{value}" | csv 2>/dev/null') 88 | 89 | def test_example2(): 90 | value, csv = 'a', 'a\n' 91 | result = expected(value, csv) 92 | assert result == run(csv, f'bsv 2>/dev/null | bsort | bdropuntil "{value}" | csv 2>/dev/null') 93 | 94 | def test_example3(): 95 | value, csv = 'ga', 'a\nb\nc\nddd\neee\nf\nga\n' 96 | result = expected(value, csv) 97 | assert result == run(csv, f'bsv 2>/dev/null | bsort | bdropuntil "{value}" | csv 2>/dev/null') 98 | 99 | def test_example4(): 100 | value, csv = 'b', 'a\na\na\nb\n' 101 | result = expected(value, csv) 102 | assert result == run(csv, f'bsv 2>/dev/null | bsort | bdropuntil "{value}" | csv 2>/dev/null') 103 | -------------------------------------------------------------------------------- /test/blz4d_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import shell 4 | import random 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import lists, composite, integers, text, sampled_from 8 | from test_util import clone_source, compile_buffer_sizes 9 | 10 | if os.environ.get('TEST_FACTOR'): 11 | buffers = list(sorted(set([128, 256, 1024, 1024 * 1024 * 5] + [random.randint(128, 1024) for _ in range(10)]))) 12 | else: 13 | buffers = [128] 14 | 15 | def setup_module(m): 16 | m.tempdir = clone_source() 17 | m.orig = os.getcwd() 18 | m.path = os.environ['PATH'] 19 | os.chdir(m.tempdir) 20 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 21 | shell.run('make clean && make bsv csv blz4 blz4d', stream=True) 22 | compile_buffer_sizes('csv', buffers) 23 | compile_buffer_sizes('bsv', buffers) 24 | compile_buffer_sizes('blz4', buffers) 25 | compile_buffer_sizes('blz4d', buffers) 26 | 27 | def teardown_module(m): 28 | os.chdir(m.orig) 29 | os.environ['PATH'] = m.path 30 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 31 | shell.run('rm -rf', m.tempdir) 32 | 33 | @composite 34 | def inputs(draw): 35 | buffer = draw(sampled_from(buffers)) 36 | num_columns = draw(integers(min_value=1, max_value=12)) 37 | column = text(string.ascii_lowercase, min_size=1) 38 | columns = lists(column, min_size=num_columns, max_size=num_columns) 39 | lines = draw(lists(columns, min_size=1)) 40 | csv = '\n'.join([','.join(line)[:64] for line in lines]) 41 | return buffer, csv 42 | 43 | @given(inputs()) 44 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 45 | def test_props(args): 46 | buffer, csv = args 47 | assert csv == shell.run(f'bsv.{buffer} | blz4.{buffer} | blz4d.{buffer} | csv.{buffer}', stdin=csv) 48 | -------------------------------------------------------------------------------- /test/bpartition_lz4_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import shell 4 | import collections 5 | import xxh3 6 | from hypothesis.database import ExampleDatabase 7 | from hypothesis import given, settings 8 | from hypothesis.strategies import text, lists, composite, integers, tuples 9 | from test_util import unindent, rm_whitespace, clone_source 10 | 11 | def setup_module(m): 12 | m.tempdir = clone_source() 13 | m.orig = os.getcwd() 14 | m.path = os.environ['PATH'] 15 | 
os.chdir(m.tempdir) 16 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 17 | shell.run('make clean && make bsv csv blz4d bcat bpartition', stream=True) 18 | 19 | def teardown_module(m): 20 | os.chdir(m.orig) 21 | os.environ['PATH'] = m.path 22 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 23 | shell.run('rm -rf', m.tempdir) 24 | 25 | @composite 26 | def inputs(draw): 27 | num_buckets = draw(integers(min_value=1, max_value=128)) 28 | num_columns = draw(integers(min_value=1, max_value=12)) 29 | column = text(string.ascii_lowercase, min_size=1) 30 | columns = lists(column, min_size=1, max_size=num_columns) 31 | lines = draw(lists(columns, min_size=1)) 32 | csv = '\n'.join([','.join(line) for line in lines]) + '\n' 33 | return num_buckets, csv 34 | 35 | def expected(num_buckets, csv): 36 | res = collections.defaultdict(list) 37 | size = len(str(num_buckets)) 38 | for line in csv.splitlines(): 39 | col0 = line.split(',', 1)[0] 40 | bucket = xxh3.oneshot_int(col0.encode()) % num_buckets 41 | res[str(bucket).zfill(size)].append(line) 42 | val = '' 43 | for k in sorted(res): 44 | for line in res[k]: 45 | val += f'prefix_{k}:{line}\n' 46 | return val.strip() 47 | 48 | @given(inputs()) 49 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 50 | def test_props(args): 51 | num_buckets, csv = args 52 | result = expected(num_buckets, csv) 53 | with shell.tempdir(): 54 | stdout = '\n'.join(sorted({l.split(':')[0] for l in result.splitlines()})) 55 | assert stdout == shell.run(f'bsv | bpartition -l {num_buckets} prefix', stdin=csv, echo=True) 56 | assert result == shell.run('bcat -l -p prefix*') 57 | 58 | def test_without_prefix(): 59 | with shell.tempdir(): 60 | stdin = """ 61 | b,c,d 62 | e,f,g 63 | h,i,j 64 | """ 65 | stdout = """ 66 | 02 67 | 04 68 | 05 69 | """ 70 | assert rm_whitespace(unindent(stdout)) == shell.run('bsv | bpartition -l 10', stdin=unindent(stdin)) 71 | 72 | def test_basic(): 73 | with shell.tempdir(): 74 | stdin = """ 75 | b,c,d 76 | e,f,g 77 | h,i,j 78 | """ 79 | stdout = """ 80 | prefix_02 81 | prefix_04 82 | prefix_05 83 | """ 84 | assert rm_whitespace(unindent(stdout)) == shell.run('bsv | bpartition -l 10 prefix', stdin=unindent(stdin)) 85 | stdout = """ 86 | prefix_02:h,i,j 87 | prefix_04:e,f,g 88 | prefix_05:b,c,d 89 | """ 90 | assert unindent(stdout).strip() == shell.run('bcat -l -p prefix*') 91 | stdout = """ 92 | prefix_02 93 | prefix_04 94 | prefix_05 95 | """ 96 | assert unindent(stdout).strip() == shell.run('ls prefix*') 97 | 98 | def test_appends(): 99 | with shell.tempdir(): 100 | stdin = """ 101 | b,c,d 102 | e,f,g 103 | h,i,j 104 | """ 105 | stdout = """ 106 | prefix_02 107 | prefix_04 108 | prefix_05 109 | """ 110 | assert rm_whitespace(unindent(stdout)) == shell.run('bsv | bpartition -l 10 prefix', stdin=unindent(stdin)) 111 | assert rm_whitespace(unindent(stdout)) == shell.run('bsv | bpartition -l 10 prefix', stdin=unindent(stdin)) 112 | stdout = """ 113 | prefix_02:h,i,j 114 | prefix_02:h,i,j 115 | prefix_04:e,f,g 116 | prefix_04:e,f,g 117 | prefix_05:b,c,d 118 | prefix_05:b,c,d 119 | """ 120 | assert unindent(stdout).strip() == shell.run('bcat -l -p prefix*') 121 | stdout = """ 122 | prefix_02 123 | prefix_04 124 | prefix_05 125 | """ 126 | assert unindent(stdout).strip() == shell.run('ls prefix*') 127 | 
-------------------------------------------------------------------------------- /test/bpartition_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import shell 4 | import collections 5 | import xxh3 6 | from hypothesis.database import ExampleDatabase 7 | from hypothesis import given, settings 8 | from hypothesis.strategies import text, lists, composite, integers, tuples 9 | from test_util import unindent, rm_whitespace, clone_source 10 | 11 | def setup_module(m): 12 | m.tempdir = clone_source() 13 | m.orig = os.getcwd() 14 | m.path = os.environ['PATH'] 15 | os.chdir(m.tempdir) 16 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 17 | shell.run('make clean && make bsv csv bcat bpartition', stream=True) 18 | 19 | def teardown_module(m): 20 | os.chdir(m.orig) 21 | os.environ['PATH'] = m.path 22 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 23 | shell.run('rm -rf', m.tempdir) 24 | 25 | @composite 26 | def inputs(draw): 27 | num_buckets = draw(integers(min_value=1, max_value=128)) 28 | num_columns = draw(integers(min_value=1, max_value=12)) 29 | column = text(string.ascii_lowercase, min_size=1) 30 | columns = lists(column, min_size=1, max_size=num_columns) 31 | lines = draw(lists(columns, min_size=1)) 32 | csv = '\n'.join([','.join(line) for line in lines]) + '\n' 33 | return num_buckets, csv 34 | 35 | def expected(num_buckets, csv): 36 | res = collections.defaultdict(list) 37 | size = len(str(num_buckets)) 38 | for line in csv.splitlines(): 39 | col0 = line.split(',', 1)[0] 40 | bucket = xxh3.oneshot_int(col0.encode()) % num_buckets 41 | res[str(bucket).zfill(size)].append(line) 42 | val = '' 43 | for k in sorted(res): 44 | for line in res[k]: 45 | val += f'prefix_{k}:{line}\n' 46 | return val.strip() 47 | 48 | @given(inputs()) 49 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 50 | def test_props(args): 51 | num_buckets, csv = args 52 | result = expected(num_buckets, csv) 53 | with shell.tempdir(): 54 | stdout = '\n'.join(sorted({l.split(':')[0] for l in result.splitlines()})) 55 | assert stdout == shell.run(f'bsv | bpartition {num_buckets} prefix', stdin=csv, echo=True) 56 | assert result == shell.run('bcat --prefix prefix*') 57 | 58 | def test_without_prefix(): 59 | with shell.tempdir(): 60 | stdin = """ 61 | b,c,d 62 | e,f,g 63 | h,i,j 64 | """ 65 | stdout = """ 66 | 02 67 | 04 68 | 05 69 | """ 70 | assert rm_whitespace(unindent(stdout)) == shell.run('bsv | bpartition 10', stdin=unindent(stdin)) 71 | 72 | def test_basic(): 73 | with shell.tempdir(): 74 | stdin = """ 75 | b,c,d 76 | e,f,g 77 | h,i,j 78 | """ 79 | stdout = """ 80 | prefix_02 81 | prefix_04 82 | prefix_05 83 | """ 84 | assert rm_whitespace(unindent(stdout)) == shell.run('bsv | bpartition 10 prefix', stdin=unindent(stdin)) 85 | stdout = """ 86 | prefix_02:h,i,j 87 | prefix_04:e,f,g 88 | prefix_05:b,c,d 89 | """ 90 | assert unindent(stdout).strip() == shell.run('bcat --prefix prefix*') 91 | stdout = """ 92 | prefix_02 93 | prefix_04 94 | prefix_05 95 | """ 96 | assert unindent(stdout).strip() == shell.run('ls prefix*') 97 | 98 | def test_appends(): 99 | with shell.tempdir(): 100 | stdin = """ 101 | b,c,d 102 | e,f,g 103 | h,i,j 104 | """ 105 | stdout = """ 106 | prefix_02 107 | prefix_04 108 | prefix_05 109 | """ 110 | assert 
rm_whitespace(unindent(stdout)) == shell.run('bsv | bpartition 10 prefix', stdin=unindent(stdin)) 111 | assert rm_whitespace(unindent(stdout)) == shell.run('bsv | bpartition 10 prefix', stdin=unindent(stdin)) 112 | stdout = """ 113 | prefix_02:h,i,j 114 | prefix_02:h,i,j 115 | prefix_04:e,f,g 116 | prefix_04:e,f,g 117 | prefix_05:b,c,d 118 | prefix_05:b,c,d 119 | """ 120 | assert unindent(stdout).strip() == shell.run('bcat --prefix prefix*') 121 | stdout = """ 122 | prefix_02 123 | prefix_04 124 | prefix_05 125 | """ 126 | assert unindent(stdout).strip() == shell.run('ls prefix*') 127 | -------------------------------------------------------------------------------- /test/brmerge_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import shell 4 | from hypothesis.database import ExampleDatabase 5 | from hypothesis import given, settings 6 | from hypothesis.strategies import text, lists, composite, integers 7 | from test_util import clone_source 8 | 9 | 10 | 11 | def setup_module(m): 12 | m.tempdir = clone_source() 13 | m.orig = os.getcwd() 14 | m.path = os.environ['PATH'] 15 | os.chdir(m.tempdir) 16 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 17 | shell.run('make clean && make bsv csv bsort bcut bmerge', stream=True) 18 | 19 | def teardown_module(m): 20 | os.chdir(m.orig) 21 | os.environ['PATH'] = m.path 22 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 23 | shell.run('rm -rf', m.tempdir) 24 | 25 | @composite 26 | def inputs(draw): 27 | num_inputs = 2 28 | csvs = [] 29 | for _ in range(num_inputs): 30 | num_columns = draw(integers(min_value=1, max_value=2)) 31 | column = text(string.ascii_lowercase, min_size=1, max_size=4) 32 | line = lists(column, min_size=num_columns, max_size=num_columns) 33 | lines = draw(lists(line)) 34 | csv = '\n'.join(sorted([','.join(x) for x in lines], reverse=True)) + '\n' 35 | csvs.append(csv) 36 | return csvs 37 | 38 | def expected(csvs): 39 | xs = [] 40 | for csv in csvs: 41 | xs += csv.splitlines() 42 | xs = sorted([x.split(',')[0] for x in xs], reverse=True) 43 | return '\n'.join(xs) + '\n' 44 | 45 | @given(inputs()) 46 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 47 | def test_props(csvs): 48 | result = expected(csvs) 49 | if result.strip(): 50 | with shell.tempdir(): 51 | paths = [] 52 | for i, csv in enumerate(csvs): 53 | path = f'file{i}.bsv' 54 | shell.run(f'bsv > {path}', stdin=csv) 55 | paths.append(path) 56 | assert result.strip() == shell.run('echo', *paths, '| bmerge -r | bcut 1 | csv', echo=True) 57 | assert shell.run('cat', *paths, '| bsort -r | bcut 1 | csv') == shell.run('echo', *paths, '| bmerge -r | bcut 1 | csv') 58 | -------------------------------------------------------------------------------- /test/brsort_f64_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import string 4 | import shell 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import text, lists, composite, integers, floats 8 | from test_util import run, rm_whitespace, clone_source 9 | 10 | def setup_module(m): 11 | m.tempdir = clone_source() 12 | m.orig = os.getcwd() 13 | m.path = os.environ['PATH'] 14 | 
os.chdir(m.tempdir) 15 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 16 | shell.run('make clean && make bsv csv bcut bsort bschema', stream=True) 17 | 18 | def teardown_module(m): 19 | os.chdir(m.orig) 20 | os.environ['PATH'] = m.path 21 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 22 | shell.run('rm -rf', m.tempdir) 23 | 24 | @composite 25 | def inputs(draw): 26 | num_columns = draw(integers(min_value=1, max_value=3)) 27 | column = floats(allow_nan=False, min_value=1e-10, max_value=1e10) 28 | line = lists(column, min_size=num_columns, max_size=num_columns) 29 | lines = draw(lists(line)) 30 | lines = [','.join(map(str, line)) for line in lines] 31 | return '\n'.join(lines) + '\n' 32 | 33 | def expected(csv): 34 | xs = csv.splitlines() 35 | xs = [float(x.split(',')[0]) for x in xs if x] 36 | xs = sorted(xs, reverse=True) 37 | return [round(x, 2) for x in xs] 38 | 39 | @given(inputs()) 40 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 41 | def test_props(csv): 42 | result = expected(csv) 43 | assert result == [round(float(x), 2) for x in run(csv, 'bsv | bschema a:f64,... | bsort f64 -r | bcut 1 | bschema f64:a | csv').splitlines() if x] 44 | -------------------------------------------------------------------------------- /test/brsort_i64_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import string 4 | import shell 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import text, lists, composite, integers 8 | from test_util import run, rm_whitespace, clone_source 9 | 10 | def setup_module(m): 11 | m.tempdir = clone_source() 12 | m.orig = os.getcwd() 13 | m.path = os.environ['PATH'] 14 | os.chdir(m.tempdir) 15 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 16 | shell.run('make clean && make bsv csv bcut bsort bschema', stream=True) 17 | 18 | def teardown_module(m): 19 | os.chdir(m.orig) 20 | os.environ['PATH'] = m.path 21 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 22 | shell.run('rm -rf', m.tempdir) 23 | 24 | @composite 25 | def inputs(draw): 26 | num_columns = draw(integers(min_value=1, max_value=3)) 27 | column = integers(min_value=-9223372036854775806, max_value=9223372036854775806) 28 | line = lists(column, min_size=num_columns, max_size=num_columns) 29 | lines = draw(lists(line)) 30 | lines = [','.join(map(str, line)) for line in lines] 31 | return '\n'.join(lines) + '\n' 32 | 33 | def expected(csv): 34 | xs = csv.splitlines() 35 | xs = [int(x.split(',')[0]) for x in xs if x] 36 | xs = sorted(xs, reverse=True) 37 | return '\n'.join(map(str, xs)) + '\n' 38 | 39 | @given(inputs()) 40 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 41 | def test_props(csv): 42 | result = expected(csv) 43 | assert result == run(csv, 'bsv | bschema a:i64,... 
| bsort i64 -r | bcut 1 | bschema i64:a | csv') 44 | -------------------------------------------------------------------------------- /test/brsort_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import string 4 | import shell 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import text, lists, composite, integers 8 | from test_util import run, rm_whitespace, clone_source 9 | 10 | def setup_module(m): 11 | m.tempdir = clone_source() 12 | m.orig = os.getcwd() 13 | m.path = os.environ['PATH'] 14 | os.chdir(m.tempdir) 15 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 16 | shell.run('make clean && make bsv csv bsort bcut', stream=True) 17 | 18 | def teardown_module(m): 19 | os.chdir(m.orig) 20 | os.environ['PATH'] = m.path 21 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 22 | shell.run('rm -rf', m.tempdir) 23 | 24 | @composite 25 | def inputs(draw): 26 | num_columns = draw(integers(min_value=1, max_value=64)) 27 | column = text(string.ascii_lowercase, min_size=1, max_size=64) 28 | line = lists(column, min_size=num_columns, max_size=num_columns) 29 | lines = draw(lists(line)) 30 | csv = '\n'.join([','.join(x) for x in lines]) + '\n' 31 | return csv 32 | 33 | def expected(csv): 34 | xs = csv.splitlines() 35 | xs = [x.split(',')[0] for x in xs] 36 | xs = sorted(xs, reverse=True) 37 | return '\n'.join(xs) + '\n' 38 | 39 | @given(inputs()) 40 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 41 | def test_props(csv): 42 | result = expected(csv) 43 | if result: 44 | assert result == run(csv, 'bsv | bsort -r | bcut 1 | csv') 45 | else: 46 | with pytest.raises(AssertionError): 47 | run(csv, 'bsv | bsort -r | bcut 1 | csv') 48 | 49 | @given(inputs()) 50 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 51 | def test_props_compatability(csv): 52 | assert run(csv, 'LC_ALL=C sort -r -k1,1 | cut -d, -f1') == run(csv, 'bsv | bsort --reversed | bcut 1 | csv') 53 | 54 | def test_compatability(): 55 | stdin = """ 56 | b 57 | c 58 | a 59 | """ 60 | stdout = """ 61 | c 62 | b 63 | a 64 | """ 65 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bsort --reversed | csv') 66 | -------------------------------------------------------------------------------- /test/brtopn_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import string 4 | import shell 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import text, lists, composite, integers 8 | from test_util import run, rm_whitespace, clone_source 9 | 10 | def setup_module(m): 11 | m.tempdir = clone_source() 12 | m.orig = os.getcwd() 13 | m.path = os.environ['PATH'] 14 | os.chdir(m.tempdir) 15 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 16 | shell.run('make clean && make bsv csv bcut btopn bschema', stream=True) 17 | 18 | def teardown_module(m): 19 | os.chdir(m.orig) 20 | os.environ['PATH'] = m.path 21 | assert m.tempdir.startswith('/tmp/') or 
m.tempdir.startswith('/private/var/folders/') 22 | shell.run('rm -rf', m.tempdir) 23 | 24 | @composite 25 | def inputs(draw): 26 | n = draw(integers(min_value=1, max_value=16)) 27 | num_columns = draw(integers(min_value=1, max_value=3)) 28 | column = text(string.ascii_lowercase, min_size=1, max_size=64) 29 | line = lists(column, min_size=num_columns, max_size=num_columns) 30 | lines = draw(lists(line)) 31 | lines = [','.join(map(str, line)) for line in lines] 32 | return n, '\n'.join(lines) + '\n' 33 | 34 | def expected(n, csv): 35 | xs = csv.splitlines() 36 | xs = [x.split(',')[0] for x in xs if x] 37 | xs = sorted(xs)[:n] 38 | return '\n'.join(xs) + '\n' 39 | 40 | @given(inputs()) 41 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 42 | def test_props(args): 43 | n, csv = args 44 | result = expected(n, csv) 45 | assert result == run(csv, f'bsv | btopn {n} -r | bcut 1 | csv ') 46 | -------------------------------------------------------------------------------- /test/bschema_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import shell 4 | from test_util import clone_source 5 | 6 | def setup_module(m): 7 | m.tempdir = clone_source() 8 | m.orig = os.getcwd() 9 | m.path = os.environ['PATH'] 10 | os.chdir(m.tempdir) 11 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 12 | shell.run('make clean && make bsv csv bschema', stream=True) 13 | 14 | def teardown_module(m): 15 | os.chdir(m.orig) 16 | os.environ['PATH'] = m.path 17 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 18 | shell.run('rm -rf', m.tempdir) 19 | 20 | def test_basic(): 21 | assert '0' == shell.run('echo 0 | bsv | bschema a:u16 | bschema u16:a | csv') 22 | assert '5' == shell.run('echo 5 | bsv | bschema a:u16 | bschema u16:a | csv') 23 | assert '5' == shell.run('echo 5 | bsv | bschema a:u32 | bschema u32:a | csv') 24 | assert '5' == shell.run('echo 5 | bsv | bschema a:u64 | bschema u64:a | csv') 25 | assert '5' == shell.run('echo 5 | bsv | bschema a:i16 | bschema i16:a | csv') 26 | assert '5' == shell.run('echo 5 | bsv | bschema a:i32 | bschema i32:a | csv') 27 | assert '5' == shell.run('echo 5 | bsv | bschema a:i64 | bschema i64:a | csv') 28 | assert '5' == shell.run('echo 5 | bsv | bschema a:f32 | bschema f32:a | csv').split('.')[0] 29 | assert '5' == shell.run('echo 5 | bsv | bschema a:f64 | bschema f64:a | csv').split('.')[0] 30 | assert '1' == shell.run('echo 1 | bsv | bschema 1,... | csv') 31 | assert '1,2' == shell.run('echo 1,2,3 | bsv | bschema 1,1,... | csv') 32 | with pytest.raises(Exception): 33 | shell.run('echo 1,2,3 | bsv | bschema fake,schema,errors | csv') 34 | with pytest.raises(Exception): 35 | shell.run('echo 1,2,3 | bsv | bschema 1,1 | csv') 36 | with pytest.raises(Exception): 37 | shell.run('echo 1,2,3 | bsv | bschema 1,1,1,1 | csv') 38 | with pytest.raises(Exception): 39 | shell.run('echo 1,2,3 | bsv | bschema 1,2,1 | csv') 40 | assert '12593,12850,13107' == shell.run('echo 11,22,33 | bsv | bschema u16:a,u16:a,u16:a | csv') 41 | assert '1,2,3' == shell.run('echo 1,2,3 | bsv | bschema 1,1,1 | csv') 42 | assert '1' == shell.run('echo 1,2,3 | bsv | bschema 1,... | csv') 43 | assert '1,2' == shell.run('echo 1,2,3 | bsv | bschema *,*,... | csv') 44 | assert '11,22' == shell.run('echo 11,22,33 | bsv | bschema *,*,... 
| csv') 45 | assert 'df,er' == shell.run('echo asdf,qwer | bsv | bschema "*2,*2" | csv') 46 | assert 'as,qw' == shell.run('echo asdf,qwer | bsv | bschema "2*,2*" | csv') 47 | with pytest.raises(Exception): 48 | shell.run('echo a,qwer,123 | bsv | bschema "2*,2*" | csv') 49 | with pytest.raises(Exception): 50 | shell.run('echo -1 | bsv | bschema "a:u64" | csv') 51 | 52 | def test_filtering(): 53 | assert '1,1\n2,2' == shell.run('echo -e "1,1\n2,2\n3\n" | bsv | bschema 1,1 --filter | csv') 54 | assert '22\n33' == shell.run('echo -e "1\n22\n33\n" | bsv | bschema 2 --filter | csv') 55 | assert '12850\n13107' == shell.run('echo -e "1\n22\n33\n" | bsv | bschema u16:a --filter | csv') 56 | assert 'as\n12' == shell.run('echo -e "asdf\nq\n123\n" | bsv | bschema "2*" --filter | csv') 57 | 58 | def test_maxint(): 59 | with pytest.raises(Exception): 60 | shell.run('echo 32768 | bsv | bschema a:i16') 61 | with pytest.raises(Exception): 62 | shell.run('echo -32769 | bsv | bschema a:i16') 63 | with pytest.raises(Exception): 64 | shell.run('echo -1 | bsv | bschema a:u16') 65 | with pytest.raises(Exception): 66 | shell.run('echo 65536 | bsv | bschema a:u16') 67 | with pytest.raises(Exception): 68 | shell.run('echo 2147483648 | bsv | bschema a:i32') 69 | with pytest.raises(Exception): 70 | shell.run('echo -2147483649 | bsv | bschema a:i32') 71 | with pytest.raises(Exception): 72 | shell.run('echo -1 | bsv | bschema a:u32') 73 | with pytest.raises(Exception): 74 | shell.run('echo 4294967296 | bsv | bschema a:u32') 75 | with pytest.raises(Exception): 76 | shell.run('echo -9223372036854775808 | bsv | bschema a:i64') 77 | with pytest.raises(Exception): 78 | shell.run('echo 9223372036854775807 | bsv | bschema a:i64') 79 | with pytest.raises(Exception): 80 | shell.run('echo -1 | bsv | bschema a:u64') 81 | with pytest.raises(Exception): 82 | shell.run('echo 18446744073709551615 | bsv | bschema a:u64') 83 | -------------------------------------------------------------------------------- /test/bsort_f64_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import string 4 | import shell 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import text, lists, composite, integers, floats 8 | from test_util import run, rm_whitespace, clone_source 9 | 10 | def setup_module(m): 11 | m.tempdir = clone_source() 12 | m.orig = os.getcwd() 13 | m.path = os.environ['PATH'] 14 | os.chdir(m.tempdir) 15 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 16 | shell.run('make clean && make bsv csv bcut bsort bschema', stream=True) 17 | 18 | def teardown_module(m): 19 | os.chdir(m.orig) 20 | os.environ['PATH'] = m.path 21 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 22 | shell.run('rm -rf', m.tempdir) 23 | 24 | @composite 25 | def inputs(draw): 26 | num_columns = draw(integers(min_value=1, max_value=3)) 27 | column = floats(allow_nan=False, min_value=1e-10, max_value=1e10) 28 | line = lists(column, min_size=num_columns, max_size=num_columns) 29 | lines = draw(lists(line)) 30 | lines = [','.join(map(str, line)) for line in lines] 31 | return '\n'.join(lines) + '\n' 32 | 33 | def expected(csv): 34 | xs = csv.splitlines() 35 | xs = [float(x.split(',')[0]) for x in xs if x] 36 | xs = sorted(xs) 37 | return [round(x, 2) for x in xs] 38 | 39 | @given(inputs()) 40 | @settings(database=ExampleDatabase(':memory:'), 
max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 41 | def test_props(csv): 42 | result = expected(csv) 43 | assert result == [round(float(x), 2) for x in run(csv, 'bsv | bschema a:f64,... | bsort f64 | bcut 1 | bschema f64:a | csv').splitlines() if x] 44 | -------------------------------------------------------------------------------- /test/bsort_i64_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import string 4 | import shell 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import text, lists, composite, integers 8 | from test_util import run, rm_whitespace, clone_source 9 | 10 | def setup_module(m): 11 | m.tempdir = clone_source() 12 | m.orig = os.getcwd() 13 | m.path = os.environ['PATH'] 14 | os.chdir(m.tempdir) 15 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 16 | shell.run('make clean && make bsv csv bcut bsort bschema', stream=True) 17 | 18 | def teardown_module(m): 19 | os.chdir(m.orig) 20 | os.environ['PATH'] = m.path 21 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 22 | shell.run('rm -rf', m.tempdir) 23 | 24 | @composite 25 | def inputs(draw): 26 | num_columns = draw(integers(min_value=1, max_value=3)) 27 | column = integers(min_value=-9223372036854775806, max_value=9223372036854775806) 28 | line = lists(column, min_size=num_columns, max_size=num_columns) 29 | lines = draw(lists(line)) 30 | lines = [','.join(map(str, line)) for line in lines] 31 | return '\n'.join(lines) + '\n' 32 | 33 | def expected(csv): 34 | xs = csv.splitlines() 35 | xs = [int(x.split(',')[0]) for x in xs if x] 36 | xs = sorted(xs) 37 | return '\n'.join(map(str, xs)) + '\n' 38 | 39 | @given(inputs()) 40 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 41 | def test_props(csv): 42 | result = expected(csv) 43 | assert result == run(csv, 'bsv | bschema a:i64,... | bsort i64 | bcut 1 | bschema i64:a | csv') 44 | 45 | 46 | @given(inputs()) 47 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 48 | def test_props_compatability(csv): 49 | assert run(csv, 'LC_ALL=C sort -n -k1,1 | cut -d, -f1') == run(csv, 'bsv | bschema a:i64,... 
| bsort i64 | bcut 1 | bschema i64:a | csv') 50 | -------------------------------------------------------------------------------- /test/bsort_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import string 4 | import shell 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import text, lists, composite, integers 8 | from test_util import run, rm_whitespace, clone_source 9 | 10 | def setup_module(m): 11 | m.tempdir = clone_source() 12 | m.orig = os.getcwd() 13 | m.path = os.environ['PATH'] 14 | os.chdir(m.tempdir) 15 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 16 | shell.run('make clean && make bsv csv bschema bcut bsort', stream=True) 17 | 18 | def teardown_module(m): 19 | os.chdir(m.orig) 20 | os.environ['PATH'] = m.path 21 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 22 | shell.run('rm -rf', m.tempdir) 23 | 24 | @composite 25 | def inputs(draw): 26 | num_columns = draw(integers(min_value=1, max_value=16)) 27 | column = text(string.ascii_letters + ':/|', min_size=1, max_size=64) 28 | line = lists(column, min_size=num_columns, max_size=num_columns) 29 | lines = draw(lists(line)) 30 | csv = '\n'.join([','.join(x) for x in lines]) + '\n' 31 | return csv 32 | 33 | def expected(csv): 34 | xs = csv.splitlines() 35 | xs = [x.split(',')[0] for x in xs] 36 | xs = sorted(xs) 37 | return '\n'.join(xs) + '\n' 38 | 39 | @given(inputs()) 40 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 41 | def test_props(csv): 42 | result = expected(csv) 43 | if result: 44 | assert result == run(csv, 'bsv | bsort | bcut 1 | csv') 45 | else: 46 | with pytest.raises(AssertionError): 47 | run(csv, 'bsv | bsort | csv') 48 | 49 | @given(inputs()) 50 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 51 | def test_props_compatability(csv): 52 | assert run(csv, 'LC_ALL=C sort -k1,1 | cut -d, -f1') == run(csv, 'bsv | bsort | bcut 1 | csv') 53 | 54 | def test_basic2(): 55 | stdin = """ 56 | a,b 57 | aa,a 58 | """ 59 | stdout = """ 60 | a,b 61 | aa,a 62 | """ 63 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bsort | csv') 64 | 65 | def test_basic(): 66 | stdin = """ 67 | aa 68 | a 69 | """ 70 | stdout = """ 71 | a 72 | aa 73 | """ 74 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bsort | csv') 75 | 76 | def test_compatability(): 77 | stdin = """ 78 | c 79 | b 80 | a 81 | """ 82 | stdout = """ 83 | a 84 | b 85 | c 86 | """ 87 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bsort | csv') 88 | 89 | def test_compatability2(): 90 | stdin = """ 91 | c,c 92 | b,b 93 | a,a 94 | """ 95 | stdout = """ 96 | a,a 97 | b,b 98 | c,c 99 | """ 100 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bsort | csv') 101 | -------------------------------------------------------------------------------- /test/bsplit_test.py: -------------------------------------------------------------------------------- 1 | import random 2 | import os 3 | import shell 4 | from hypothesis.database import ExampleDatabase 5 | from hypothesis import given, settings 6 | from 
hypothesis.strategies import composite, integers, sampled_from 7 | from test_util import compile_buffer_sizes, clone_source 8 | 9 | if os.environ.get('TEST_FACTOR'): 10 | buffers = list(sorted(set([64, 128, 256, 1024, 1024 * 1024 * 5] + [random.randint(64, 1024) for _ in range(10)]))) 11 | else: 12 | buffers = [128] 13 | 14 | def setup_module(m): 15 | m.tempdir = clone_source() 16 | m.orig = os.getcwd() 17 | m.path = os.environ['PATH'] 18 | os.chdir(m.tempdir) 19 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 20 | shell.run('make clean', stream=True) 21 | compile_buffer_sizes('bsv', buffers) 22 | compile_buffer_sizes('csv', buffers) 23 | compile_buffer_sizes('bsplit', buffers) 24 | shell.run('make bsv csv bsplit xxh3 _gen_csv') 25 | 26 | def teardown_module(m): 27 | os.chdir(m.orig) 28 | os.environ['PATH'] = m.path 29 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 30 | shell.run('rm -rf', m.tempdir) 31 | 32 | @composite 33 | def inputs(draw): 34 | buffer = draw(sampled_from(buffers)) 35 | lines = draw(integers(min_value=0, max_value=1024 * 8)) 36 | chunks_per_file = draw(integers(min_value=0, max_value=64)) 37 | return buffer, lines, chunks_per_file 38 | 39 | @given(inputs()) 40 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 41 | def test_props(args): 42 | buffer, lines, chunks_per_file = args 43 | if not chunks_per_file: 44 | chunks_per_file = '' 45 | with shell.tempdir(): 46 | shell.run(f'_gen_csv 2 {lines} | bsv.{buffer} > data.bsv', echo=True) 47 | shell.run(f'cat data.bsv | bsplit.{buffer} prefix {chunks_per_file} > filenames') 48 | assert shell.run(f'cat data.bsv | csv.{buffer} | xxh3') == shell.run(f'cat filenames | while read path; do cat $path; done | csv.{buffer} | xxh3') 49 | -------------------------------------------------------------------------------- /test/bsum_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import shell 4 | from hypothesis.database import ExampleDatabase 5 | from hypothesis import given, settings 6 | from hypothesis.strategies import text, lists, composite, integers 7 | from test_util import run, clone_source 8 | 9 | def setup_module(m): 10 | m.tempdir = clone_source() 11 | m.orig = os.getcwd() 12 | m.path = os.environ['PATH'] 13 | os.chdir(m.tempdir) 14 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 15 | shell.run('make clean && make bsv csv bschema bsum bcut', stream=True) 16 | 17 | def teardown_module(m): 18 | os.chdir(m.orig) 19 | os.environ['PATH'] = m.path 20 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 21 | shell.run('rm -rf', m.tempdir) 22 | 23 | @composite 24 | def inputs(draw): 25 | num_columns = draw(integers(min_value=1, max_value=64)) 26 | column = text(string.digits, min_size=1, max_size=16) 27 | line = lists(column, min_size=num_columns, max_size=num_columns) 28 | lines = draw(lists(line)) 29 | csv = '\n'.join([','.join(x) for x in lines]) + '\n' 30 | return csv 31 | 32 | def expected(csv): 33 | val = 0 34 | for line in csv.splitlines(): 35 | col = line.split(',')[0] 36 | if col: 37 | val += int(col) 38 | return val 39 | 40 | @given(inputs()) 41 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), 
deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 42 | def test_props(args): 43 | csv = args 44 | result = expected(csv) 45 | assert result == int(run(csv, 'bsv | bschema a:i64,... | bsum i64 | bcut 1 | bschema i64:a | csv')) 46 | 47 | def test1(): 48 | stdin = """ 49 | 1 50 | 1 51 | 1 52 | """ 53 | assert '3' == shell.run('bsv | bschema a:i64 | bsum i64 | bschema i64:a | csv', stdin=stdin) 54 | -------------------------------------------------------------------------------- /test/bsumeach_f64_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shell 3 | from test_util import run, rm_whitespace, rm_whitespace, clone_source 4 | 5 | def setup_module(m): 6 | m.tempdir = clone_source() 7 | m.orig = os.getcwd() 8 | m.path = os.environ['PATH'] 9 | os.chdir(m.tempdir) 10 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 11 | shell.run('make clean && make bsv csv bschema bsumeach', stream=True) 12 | 13 | def teardown_module(m): 14 | os.chdir(m.orig) 15 | os.environ['PATH'] = m.path 16 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 17 | shell.run('rm -rf', m.tempdir) 18 | 19 | def test_basic(): 20 | stdin = """ 21 | a,1.1 22 | a,2.1 23 | a,3.1 24 | b,4.1 25 | b,5.1 26 | a,6.1 27 | """ 28 | stdout = """ 29 | a,6.3 30 | b,9.2 31 | a,6.1 32 | """ 33 | result = run(rm_whitespace(stdin), 'bsv | bschema *,a:f64 | bsumeach f64 | bschema *,f64:a | csv') 34 | result = '\n'.join(f'{k},{round(float(v), 3)}' for line in result.splitlines() for k, v in [line.split(',')]) + '\n' 35 | assert rm_whitespace(stdout) + '\n' == result 36 | -------------------------------------------------------------------------------- /test/bsumeach_hash_i64_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shell 3 | from test_util import run, rm_whitespace, rm_whitespace, clone_source 4 | 5 | def setup_module(m): 6 | m.tempdir = clone_source() 7 | m.orig = os.getcwd() 8 | m.path = os.environ['PATH'] 9 | os.chdir(m.tempdir) 10 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 11 | shell.run('make clean && make bsv csv bsort bschema bsumeach-hash', stream=True) 12 | 13 | def teardown_module(m): 14 | os.chdir(m.orig) 15 | os.environ['PATH'] = m.path 16 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 17 | shell.run('rm -rf', m.tempdir) 18 | 19 | def test_basic(): 20 | stdin = """ 21 | a,1 22 | a,2 23 | a,3 24 | b,4 25 | b,5 26 | a,6 27 | """ 28 | stdout = """ 29 | a,12 30 | b,9 31 | """ 32 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bschema *,a:i64 | bsumeach-hash i64 | bschema *,i64:a | bsort | csv') 33 | -------------------------------------------------------------------------------- /test/bsumeach_hash_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nathants/bsv/1e8924d6e169b117138731cb90eafc8c626bea47/test/bsumeach_hash_test.py -------------------------------------------------------------------------------- /test/bsumeach_i64_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shell 3 | from test_util import run, rm_whitespace, rm_whitespace, clone_source 4 | 5 | def setup_module(m): 6 | m.tempdir = clone_source() 7 | m.orig = os.getcwd() 8 | m.path = 
os.environ['PATH'] 9 | os.chdir(m.tempdir) 10 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 11 | shell.run('make clean && make bsv csv bschema bsumeach', stream=True) 12 | 13 | def teardown_module(m): 14 | os.chdir(m.orig) 15 | os.environ['PATH'] = m.path 16 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 17 | shell.run('rm -rf', m.tempdir) 18 | 19 | def test_basic(): 20 | stdin = """ 21 | a,1 22 | a,2 23 | a,3 24 | b,4 25 | b,5 26 | a,6 27 | """ 28 | stdout = """ 29 | a,6 30 | b,9 31 | a,6 32 | """ 33 | assert rm_whitespace(stdout) + '\n' == run(rm_whitespace(stdin), 'bsv | bschema *,a:i64 | bsumeach i64 | bschema *,i64:a | csv') 34 | -------------------------------------------------------------------------------- /test/btake_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import shell 4 | from hypothesis.database import ExampleDatabase 5 | from hypothesis import given, settings 6 | from hypothesis.strategies import lists, composite, integers, randoms, floats, text 7 | from test_util import run, clone_source 8 | 9 | def setup_module(m): 10 | m.tempdir = clone_source() 11 | m.orig = os.getcwd() 12 | m.path = os.environ['PATH'] 13 | os.chdir(m.tempdir) 14 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 15 | shell.run('make clean && make bsv csv btake', stream=True) 16 | 17 | def teardown_module(m): 18 | os.chdir(m.orig) 19 | os.environ['PATH'] = m.path 20 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 21 | shell.run('rm -rf', m.tempdir) 22 | 23 | @composite 24 | def inputs(draw): 25 | r = draw(randoms()) 26 | num_text_columns = draw(integers(min_value=1, max_value=4)) 27 | text_column = text(string.ascii_lowercase, min_size=1, max_size=8) 28 | text_line = lists(text_column, min_size=num_text_columns, max_size=num_text_columns) 29 | lines = draw(lists(text_line, min_size=1)) 30 | first_column_values = [line[0] for line in lines] 31 | threshold = draw(floats(min_value=0, max_value=1)) 32 | for line in lines: 33 | if line and r.random() > threshold: 34 | line[0] = r.choice(first_column_values) 35 | csv = '\n'.join([','.join(l) for l in lines if l]).strip() + '\n' 36 | value = r.choice(first_column_values) 37 | return value, csv 38 | 39 | def parse(value): 40 | if value.isdigit(): 41 | value = int(value) 42 | return value 43 | 44 | def expected(value, csv): 45 | value = parse(value) 46 | res = [] 47 | for line in csv.splitlines(): 48 | columns = line.split(',') 49 | if columns and parse(columns[0]) != value: 50 | break 51 | res.append(','.join(str(parse(x)) for x in columns)) 52 | return '\n'.join(res) + '\n' 53 | 54 | @given(inputs()) 55 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 56 | def test_props(args): 57 | value, csv = args 58 | result = expected(value, csv) 59 | assert result == run(csv, f'bsv | btake "{value}" | csv') 60 | -------------------------------------------------------------------------------- /test/btakeuntil_i64_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shell 3 | import string 4 | from hypothesis.database import ExampleDatabase 5 | from hypothesis import given, settings 6 | from hypothesis.strategies import lists, composite, integers, 
randoms, floats, text 7 | from test_util import run, clone_source 8 | 9 | def setup_module(m): 10 | m.tempdir = clone_source() 11 | m.orig = os.getcwd() 12 | m.path = os.environ['PATH'] 13 | os.chdir(m.tempdir) 14 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 15 | shell.run('make clean && make bsv csv bschema bsort btakeuntil', stream=True) 16 | 17 | def teardown_module(m): 18 | os.chdir(m.orig) 19 | os.environ['PATH'] = m.path 20 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 21 | shell.run('rm -rf', m.tempdir) 22 | 23 | @composite 24 | def inputs(draw): 25 | r = draw(randoms()) 26 | num_columns = draw(integers(min_value=1, max_value=4)) 27 | column = integers(min_value=-9223372036854775806, max_value=9223372036854775806) 28 | line = lists(column, min_size=num_columns, max_size=num_columns) 29 | lines = draw(lists(line, min_size=1)) 30 | lines = [[str(x) for x in line] for line in lines] 31 | first_column_values = [line[0] for line in lines] 32 | threshold = draw(floats(min_value=0, max_value=1)) 33 | for line in lines: 34 | if line and r.random() > threshold: 35 | line[0] = r.choice(first_column_values) 36 | csv = '\n'.join([','.join(l) for l in lines if l]).strip() + '\n' 37 | value = r.choice(first_column_values) 38 | return value, csv 39 | 40 | def expected(value, csv): 41 | value = int(value) 42 | res = [] 43 | lines = csv.splitlines() 44 | lines = [[int(x) for x in line.split(',')] for line in lines] 45 | lines = sorted(lines) 46 | 47 | for cols in lines: 48 | if cols: 49 | if cols[0] >= value: 50 | break 51 | res.append(str(cols[0])) 52 | return '\n'.join(res) + '\n' 53 | 54 | @given(inputs()) 55 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 56 | def test_props(args): 57 | value, csv = args 58 | result = expected(value, csv) 59 | assert result.splitlines() == run(csv, f'bsv | bschema a:i64,... 
| bsort i64 | btakeuntil "{value}" i64 | bschema i64:a | csv').splitlines() 60 | -------------------------------------------------------------------------------- /test/btopn_i64_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | import string 4 | import shell 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import text, lists, composite, integers 8 | from test_util import run, rm_whitespace, clone_source 9 | 10 | def setup_module(m): 11 | m.tempdir = clone_source() 12 | m.orig = os.getcwd() 13 | m.path = os.environ['PATH'] 14 | os.chdir(m.tempdir) 15 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 16 | shell.run('make clean && make bsv csv bcut btopn bschema', stream=True) 17 | 18 | def teardown_module(m): 19 | os.chdir(m.orig) 20 | os.environ['PATH'] = m.path 21 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 22 | shell.run('rm -rf', m.tempdir) 23 | 24 | @composite 25 | def inputs(draw): 26 | n = draw(integers(min_value=1, max_value=16)) 27 | num_columns = draw(integers(min_value=1, max_value=3)) 28 | column = integers(min_value=-9223372036854775806, max_value=9223372036854775806) 29 | line = lists(column, min_size=num_columns, max_size=num_columns) 30 | lines = draw(lists(line)) 31 | lines = [','.join(map(str, line)) for line in lines] 32 | return n, '\n'.join(lines) + '\n' 33 | 34 | def expected(n, csv): 35 | xs = csv.splitlines() 36 | xs = [int(x.split(',')[0]) for x in xs if x] 37 | xs = sorted(xs, reverse=True)[:n] 38 | return '\n'.join(map(str, xs)) + '\n' 39 | 40 | @given(inputs()) 41 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 42 | def test_props(args): 43 | n, csv = args 44 | result = expected(n, csv) 45 | assert result == run(csv, f'bsv | bschema a:i64,... 
| btopn {n} i64 | bschema i64:a | csv ') 46 | -------------------------------------------------------------------------------- /test/btopn_test.py: -------------------------------------------------------------------------------- 1 | import random 2 | import os 3 | import string 4 | import shell 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import text, lists, composite, integers, sampled_from 8 | from test_util import run, clone_source, compile_buffer_sizes 9 | 10 | if os.environ.get('TEST_FACTOR'): 11 | buffers = list(sorted(set([128, 256, 1024, 1024 * 1024 * 5] + [random.randint(128, 1024) for _ in range(10)]))) 12 | else: 13 | buffers = [128] 14 | 15 | def setup_module(m): 16 | m.tempdir = clone_source() 17 | m.orig = os.getcwd() 18 | m.path = os.environ['PATH'] 19 | os.chdir(m.tempdir) 20 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 21 | shell.run('make clean', stream=True) 22 | compile_buffer_sizes('csv', buffers) 23 | compile_buffer_sizes('bsv', buffers) 24 | compile_buffer_sizes('bcut', buffers) 25 | compile_buffer_sizes('btopn', buffers) 26 | compile_buffer_sizes('bschema', buffers) 27 | shell.run('make bsv csv bcut btopn bschema', stream=True) 28 | 29 | def teardown_module(m): 30 | os.chdir(m.orig) 31 | os.environ['PATH'] = m.path 32 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 33 | shell.run('rm -rf', m.tempdir) 34 | 35 | @composite 36 | def inputs(draw): 37 | buffer = draw(sampled_from(buffers)) 38 | n = draw(integers(min_value=1, max_value=16)) 39 | num_columns = draw(integers(min_value=1, max_value=3)) 40 | column = text(string.ascii_lowercase, min_size=1, max_size=20) 41 | line = lists(column, min_size=num_columns, max_size=num_columns) 42 | lines = draw(lists(line)) 43 | lines = [','.join(map(str, line)) for line in lines] 44 | return buffer, n, '\n'.join(lines) + '\n' 45 | 46 | def expected(n, csv): 47 | xs = csv.splitlines() 48 | xs = [x.split(',')[0] for x in xs if x] 49 | xs = sorted(xs, reverse=True)[:n] 50 | return '\n'.join(xs) + '\n' 51 | 52 | @given(inputs()) 53 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 54 | def test_props(args): 55 | buffer, n, csv = args 56 | result = expected(n, csv) 57 | assert result == run(csv, f'bsv.{buffer} | btopn.{buffer} {n} | bcut.{buffer} 1 | csv.{buffer}') 58 | -------------------------------------------------------------------------------- /test/bunzip_lz4_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | import shell 4 | from hypothesis.database import ExampleDatabase 5 | from hypothesis import given, settings 6 | from hypothesis.strategies import lists, composite, integers, text 7 | from test_util import run, clone_source 8 | 9 | def setup_module(m): 10 | m.tempdir = clone_source() 11 | m.orig = os.getcwd() 12 | m.path = os.environ['PATH'] 13 | os.chdir(m.tempdir) 14 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 15 | shell.run('make clean && make bsv csv bunzip blz4d', stream=True) 16 | 17 | def teardown_module(m): 18 | os.chdir(m.orig) 19 | os.environ['PATH'] = m.path 20 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 21 | shell.run('rm -rf', m.tempdir) 22 | 23 | 
@composite 24 | def inputs(draw): 25 | num_columns = draw(integers(min_value=1, max_value=12)) 26 | zipcol = integers(min_value=0, max_value=num_columns - 1) 27 | zipcols = draw(lists(zipcol, min_size=1, max_size=16)) 28 | column = text(string.ascii_lowercase, min_size=1) 29 | columns = lists(column, min_size=num_columns, max_size=num_columns) 30 | lines = draw(lists(columns, min_size=1)) 31 | csv = '\n'.join([','.join(line) for line in lines]) + '\n' 32 | return zipcols, csv 33 | 34 | @given(inputs()) 35 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 36 | def test_props(args): 37 | zipcols, csv = args 38 | just = max(len(str(zipcol)) for zipcol in zipcols) 39 | zipcols = [str(i).rjust(just, '0') for i in zipcols] 40 | for i, column in enumerate(run(csv, 'bsv | bunzip -l prefix').splitlines()): 41 | result = '\n'.join(row.split(',')[i] for row in csv.splitlines()) 42 | assert result == shell.run(f'< {column} blz4d | csv') 43 | -------------------------------------------------------------------------------- /test/bunzip_test.py: -------------------------------------------------------------------------------- 1 | import random 2 | import os 3 | import string 4 | import shell 5 | from hypothesis.database import ExampleDatabase 6 | from hypothesis import given, settings 7 | from hypothesis.strategies import lists, composite, integers, text, sampled_from 8 | from test_util import run, clone_source, compile_buffer_sizes 9 | 10 | if os.environ.get('TEST_FACTOR'): 11 | buffers = list(sorted(set([128, 256, 1024, 1024 * 1024 * 5] + [random.randint(128, 1024) for _ in range(10)]))) 12 | else: 13 | buffers = [128] 14 | 15 | def setup_module(m): 16 | m.tempdir = clone_source() 17 | m.orig = os.getcwd() 18 | m.path = os.environ['PATH'] 19 | os.chdir(m.tempdir) 20 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 21 | shell.run('make clean', stream=True) 22 | compile_buffer_sizes('csv', buffers) 23 | compile_buffer_sizes('bsv', buffers) 24 | compile_buffer_sizes('bunzip', buffers) 25 | compile_buffer_sizes('bcat', buffers) 26 | shell.run('make bsv csv bcat bunzip', stream=True) 27 | 28 | def teardown_module(m): 29 | os.chdir(m.orig) 30 | os.environ['PATH'] = m.path 31 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 32 | shell.run('rm -rf', m.tempdir) 33 | 34 | @composite 35 | def inputs(draw): 36 | buffer = draw(sampled_from(buffers)) 37 | num_columns = draw(integers(min_value=1, max_value=12)) 38 | zipcol = integers(min_value=0, max_value=num_columns - 1) 39 | zipcols = draw(lists(zipcol, min_size=1, max_size=16)) 40 | column = text(string.ascii_lowercase, min_size=1, max_size=5) 41 | columns = lists(column, min_size=num_columns, max_size=num_columns) 42 | lines = draw(lists(columns, min_size=1)) 43 | csv = '\n'.join([','.join(line) for line in lines]) + '\n' 44 | return buffer, zipcols, csv 45 | 46 | @given(inputs()) 47 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 48 | def test_props(args): 49 | buffer, zipcols, csv = args 50 | just = max(len(str(zipcol)) for zipcol in zipcols) 51 | zipcols = [str(i).rjust(just, '0') for i in zipcols] 52 | for i, column in enumerate(run(csv, f'bsv.{buffer} | bunzip.{buffer} prefix').splitlines()): 53 | result = 
'\n'.join(row.split(',')[i] for row in csv.splitlines()) 54 | assert result == shell.run('bcat', column) 55 | -------------------------------------------------------------------------------- /test/bzip_lz4_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import uuid 3 | import os 4 | import string 5 | import shell 6 | import random 7 | from hypothesis.database import ExampleDatabase 8 | from hypothesis import given, settings 9 | from hypothesis.strategies import lists, composite, integers, text, randoms, sampled_from 10 | from test_util import run, clone_source, compile_buffer_sizes 11 | 12 | if os.environ.get('TEST_FACTOR'): 13 | buffers = list(sorted(set([128, 256, 1024, 1024 * 1024 * 5] + [random.randint(128, 1024) for _ in range(10)]))) 14 | else: 15 | buffers = [128] 16 | 17 | def setup_module(m): 18 | m.tempdir = clone_source() 19 | m.orig = os.getcwd() 20 | m.path = os.environ['PATH'] 21 | os.chdir(m.tempdir) 22 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 23 | shell.run('make clean', stream=True) 24 | compile_buffer_sizes('csv', buffers) 25 | compile_buffer_sizes('bsv', buffers) 26 | compile_buffer_sizes('blz4', buffers) 27 | compile_buffer_sizes('bzip', buffers) 28 | compile_buffer_sizes('bunzip', buffers) 29 | shell.run('make bsv csv blz4 bzip bunzip', stream=True) 30 | 31 | def teardown_module(m): 32 | os.chdir(m.orig) 33 | os.environ['PATH'] = m.path 34 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 35 | shell.run('rm -rf', m.tempdir) 36 | 37 | @composite 38 | def inputs(draw): 39 | buffer = draw(sampled_from(buffers)) 40 | rand = draw(randoms()) 41 | num_columns = draw(integers(min_value=1, max_value=12)) 42 | zipcol = integers(min_value=0, max_value=num_columns - 1) 43 | zipcols = draw(lists(zipcol, min_size=1, max_size=16)) 44 | zipcols = list(set(zipcols)) 45 | rand.shuffle(zipcols) 46 | column = text(string.ascii_lowercase, min_size=1, max_size=5) 47 | columns = lists(column, min_size=num_columns, max_size=num_columns) 48 | lines = draw(lists(columns, min_size=1)) 49 | csv = '\n'.join([','.join(line) for line in lines]) + '\n' 50 | return buffer, zipcols, csv 51 | 52 | def expected(zipcols, csv): 53 | res = [] 54 | for line in csv.splitlines(): 55 | columns = line.split(',') 56 | res.append(','.join(columns[zipcol] for zipcol in zipcols)) 57 | return '\n'.join(res) + '\n' 58 | 59 | @given(inputs()) 60 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 61 | def test_props(args): 62 | buffer, zipcols, csv = args 63 | result = expected(zipcols, csv) 64 | cols = ','.join(str(i + 1) for i in zipcols) 65 | prefix = str(uuid.uuid4()) 66 | assert result == run(csv, f'bsv.{buffer} | bunzip.{buffer} -l {prefix} >/dev/null && ls {prefix}_* | bzip.{buffer} -l {cols} | csv.{buffer}') 67 | 68 | def test_selection(): 69 | shell.run('echo -e "a\nb\n" | bsv | blz4 > a') 70 | shell.run('echo -e "1\n2\n" | bsv | blz4 > b') 71 | assert '1,a\n2,b' == shell.run('echo a b | bzip -l 2,1 | csv') 72 | assert 'a,1\nb,2' == shell.run('echo a b | bzip -l 1,2 | csv') 73 | assert 'a\nb' == shell.run('echo a b | bzip -l 1 | csv') 74 | assert '1\n2' == shell.run('echo a b | bzip -l 2 | csv') 75 | with pytest.raises(Exception): 76 | assert '1\n2' == shell.run('echo a b | bzip -l 0 | csv') 77 | with pytest.raises(Exception): 78 | assert 
'1\n2' == shell.run('echo a b | bzip -l 3 | csv') 79 | with pytest.raises(Exception): 80 | assert '1\n2' == shell.run('echo a b | bzip -l 1,1 | csv') 81 | 82 | def test_different_lengths(): 83 | shell.run('echo -e "a\nb\nc\n" | bsv | blz4 > a') 84 | shell.run('echo -e "a\nb\n" | bsv | blz4 > b') 85 | with pytest.raises(Exception): 86 | shell.run('echo a b | bzip -l') 87 | 88 | def test_more_than_1_column(): 89 | shell.run('echo -e "a\nb\nc\n" | bsv | blz4 > a') 90 | shell.run('echo -e "a\nb\nc,c\n" | bsv | blz4 > b') 91 | with pytest.raises(Exception): 92 | shell.run('echo a b | bzip -l') 93 | -------------------------------------------------------------------------------- /test/bzip_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import uuid 3 | import os 4 | import string 5 | import shell 6 | import random 7 | from hypothesis.database import ExampleDatabase 8 | from hypothesis import given, settings 9 | from hypothesis.strategies import lists, composite, integers, text, randoms, sampled_from 10 | from test_util import run, clone_source, compile_buffer_sizes 11 | 12 | if os.environ.get('TEST_FACTOR'): 13 | buffers = list(sorted(set([128, 256, 1024, 1024 * 1024 * 5] + [random.randint(128, 1024) for _ in range(10)]))) 14 | else: 15 | buffers = [128] 16 | 17 | def setup_module(m): 18 | m.tempdir = clone_source() 19 | m.orig = os.getcwd() 20 | m.path = os.environ['PATH'] 21 | os.chdir(m.tempdir) 22 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 23 | shell.run('make clean', stream=True) 24 | compile_buffer_sizes('csv', buffers) 25 | compile_buffer_sizes('bsv', buffers) 26 | compile_buffer_sizes('bzip', buffers) 27 | compile_buffer_sizes('bunzip', buffers) 28 | shell.run('make bsv csv bzip bunzip', stream=True) 29 | 30 | 31 | def teardown_module(m): 32 | os.chdir(m.orig) 33 | os.environ['PATH'] = m.path 34 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 35 | shell.run('rm -rf', m.tempdir) 36 | 37 | @composite 38 | def inputs(draw): 39 | buffer = draw(sampled_from(buffers)) 40 | rand = draw(randoms()) 41 | num_columns = draw(integers(min_value=1, max_value=12)) 42 | zipcol = integers(min_value=0, max_value=num_columns - 1) 43 | zipcols = draw(lists(zipcol, min_size=1, max_size=16)) 44 | zipcols = list(set(zipcols)) 45 | rand.shuffle(zipcols) 46 | column = text(string.ascii_lowercase, min_size=1, max_size=5) 47 | columns = lists(column, min_size=num_columns, max_size=num_columns) 48 | lines = draw(lists(columns, min_size=1)) 49 | csv = '\n'.join([','.join(line) for line in lines]) + '\n' 50 | return buffer, zipcols, csv 51 | 52 | def expected(zipcols, csv): 53 | res = [] 54 | for line in csv.splitlines(): 55 | columns = line.split(',') 56 | res.append(','.join(columns[zipcol] for zipcol in zipcols)) 57 | return '\n'.join(res) + '\n' 58 | 59 | @given(inputs()) 60 | @settings(database=ExampleDatabase(':memory:'), max_examples=100 * int(os.environ.get('TEST_FACTOR', 1)), deadline=os.environ.get("TEST_DEADLINE", 1000 * 60)) # type: ignore 61 | def test_props(args): 62 | buffer, zipcols, csv = args 63 | result = expected(zipcols, csv) 64 | cols = ','.join(str(i + 1) for i in zipcols) 65 | prefix = str(uuid.uuid4()) 66 | assert result == run(csv, f'bsv.{buffer} | bunzip.{buffer} {prefix} >/dev/null && ls {prefix}_* | bzip.{buffer} {cols} | csv.{buffer}') 67 | 68 | def test_selection(): 69 | shell.run('echo -e "a\nb\n" | bsv > a') 70 | shell.run('echo -e 
"1\n2\n" | bsv > b') 71 | assert '1,a\n2,b' == shell.run('echo a b | bzip 2,1 | csv') 72 | assert 'a,1\nb,2' == shell.run('echo a b | bzip 1,2 | csv') 73 | assert 'a\nb' == shell.run('echo a b | bzip 1 | csv') 74 | assert '1\n2' == shell.run('echo a b | bzip 2 | csv') 75 | with pytest.raises(Exception): 76 | assert '1\n2' == shell.run('echo a b | bzip 0 | csv') 77 | with pytest.raises(Exception): 78 | assert '1\n2' == shell.run('echo a b | bzip 3 | csv') 79 | with pytest.raises(Exception): 80 | assert '1\n2' == shell.run('echo a b | bzip 1,1 | csv') 81 | 82 | def test_different_lengths(): 83 | shell.run('echo -e "a\nb\nc\n" | bsv > a') 84 | shell.run('echo -e "a\nb\n" | bsv > b') 85 | with pytest.raises(Exception): 86 | shell.run('echo a b | bzip') 87 | 88 | def test_more_than_1_column(): 89 | shell.run('echo -e "a\nb\nc\n" | bsv > a') 90 | shell.run('echo -e "a\nb\nc,c\n" | bsv > b') 91 | with pytest.raises(Exception): 92 | shell.run('echo a b | bzip') 93 | -------------------------------------------------------------------------------- /test/csv_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nathants/bsv/1e8924d6e169b117138731cb90eafc8c626bea47/test/csv_test.py -------------------------------------------------------------------------------- /test/test_util.py: -------------------------------------------------------------------------------- 1 | import shell 2 | import sys 3 | import uuid 4 | import os 5 | 6 | with shell.climb_git_root(): 7 | max_columns = int(shell.run('cat util/util.h | grep "define MAX_COLUMNS"').split()[-1]) 8 | 9 | def clone_source(): 10 | with shell.climb_git_root(): 11 | orig = os.getcwd() 12 | with shell.tempdir(cleanup=False): 13 | shell.run(f"rsync -avhc {orig}/ . 
--exclude '.git' --exclude '.tox' --exclude '.backups' --exclude '__pycache__' --exclude '.hypothesis' --exclude '.ccls-cache'") 14 | shell.run('mkdir .git') 15 | return os.getcwd() 16 | 17 | def run(stdin, *args): 18 | with shell.climb_git_root(): 19 | stdinpath = f'stdin.{uuid.uuid4()}' 20 | stdoutpath = f'stdout.{uuid.uuid4()}' 21 | with open(stdinpath, 'w') as f: 22 | f.write(stdin) 23 | shell.run(*(('set -o pipefail; cat', stdinpath, '|') + args + ('>', stdoutpath)), stream=True) 24 | with open(stdoutpath) as f: 25 | return f.read() 26 | 27 | def runb(stdin, *args): 28 | with shell.climb_git_root(): 29 | stdinpath = f'stdin.{uuid.uuid4()}' 30 | stdoutpath = f'stdout.{uuid.uuid4()}' 31 | if isinstance(stdin, str): 32 | with open(stdinpath, 'w') as f: 33 | f.write(stdin) 34 | else: 35 | with open(stdinpath, 'wb') as f: 36 | f.write(stdin) 37 | shell.run(*(('set -o pipefail; cat', stdinpath, '|') + args + ('>', stdoutpath)), stream=True) 38 | with open(stdoutpath, 'rb') as f: 39 | return f.read() 40 | 41 | def unindent(text): 42 | return '\n'.join([x.lstrip() for x in text.splitlines()]) + '\n' 43 | 44 | def rm_whitespace(x): 45 | return '\n'.join([y.strip().replace(' ', '') for y in x.splitlines() if y.strip()]) 46 | 47 | def compile_buffer_sizes(name, buffers): 48 | with shell.climb_git_root(): 49 | shell.run('cp -f util/util.h util/util.h.bak') 50 | try: 51 | for i in buffers: 52 | shell.run(f'cat util/util.h.bak | sed -E "s/#define BUFFER_SIZE .*/#define BUFFER_SIZE {i}/" > util/util.h') 53 | print('compile:', name, i, flush=True, file=sys.stderr) 54 | shell.run('make', name) 55 | shell.run(f'mv -f bin/{name} bin/{name}.{i}') 56 | finally: 57 | shell.run('cat util/util.h.bak > util/util.h') 58 | shell.run('rm -f util/util.h.bak') 59 | -------------------------------------------------------------------------------- /test/xxh3_test.py: -------------------------------------------------------------------------------- 1 | import shell 2 | import io 3 | import os 4 | import xxh3 5 | from test_util import clone_source 6 | 7 | def setup_module(m): 8 | m.tempdir = clone_source() 9 | m.orig = os.getcwd() 10 | m.path = os.environ['PATH'] 11 | os.chdir(m.tempdir) 12 | os.environ['PATH'] = f'{os.getcwd()}/bin:/usr/bin:/usr/local/bin:/sbin:/usr/sbin:/bin' 13 | shell.run('make clean && make xxh3', stream=True) 14 | 15 | def teardown_module(m): 16 | os.chdir(m.orig) 17 | os.environ['PATH'] = m.path 18 | assert m.tempdir.startswith('/tmp/') or m.tempdir.startswith('/private/var/folders/') 19 | shell.run('rm -rf', m.tempdir) 20 | 21 | def test_hex(): 22 | assert '079364cbfdf9f4cb' == shell.run('echo abc | xxh3') 23 | assert '079364cbfdf9f4cb' == xxh3.oneshot_hex('abc\n'.encode()) 24 | 25 | def test_int(): 26 | assert '545890807144117451' == shell.run('echo abc | xxh3 --int') 27 | assert 545890807144117451 == xxh3.oneshot_int('abc\n'.encode()) 28 | 29 | def test_stream(): 30 | assert { 31 | 'cmd': 'set -eou pipefail; echo abc | xxh3 --stream', 32 | 'exitcode': 0, 33 | 'stderr': '079364cbfdf9f4cb', 34 | 'stdout': 'abc', 35 | } == shell.run('echo abc | xxh3 --stream', warn=True) 36 | assert '079364cbfdf9f4cb' == xxh3.stream_hex(io.BytesIO('abc\n'.encode())) 37 | assert { 38 | 'cmd': 'set -eou pipefail; echo abc | xxh3 --stream --int', 39 | 'exitcode': 0, 40 | 'stderr': '545890807144117451', 41 | 'stdout': 'abc', 42 | } == shell.run('echo abc | xxh3 --stream --int', warn=True) 43 | assert 545890807144117451 == xxh3.stream_int(io.BytesIO('abc\n'.encode())) 44 | 
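# as test_stream above shows, xxh3 --stream tees its input through to stdout
# and writes the checksum to stderr, so it can sit inside a pipeline without
# consuming the data. a sketch (file names illustrative):
#
#   cat data.csv | xxh3 --stream 2>data.checksum | bsv > data.bsv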
-------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = python3 3 | skipsdist = True 4 | 5 | [testenv] 6 | passenv = * 7 | whitelist_externals = bash 8 | commands = 9 | bash -xc 'py.test -n auto -vx --tb native --durations 40 test/' 10 | 11 | deps = 12 | git+https://github.com/nathants/py-util 13 | git+https://github.com/nathants/py-shell 14 | git+https://github.com/nathants/py-pool 15 | git+https://github.com/nathants/cffi-xxh3 16 | hypothesis 17 | pytest 18 | pytest-xdist 19 | numpy 20 | -------------------------------------------------------------------------------- /util/array.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define ARRAY_EXPAND_CAPACITY 1024 * 512 4 | 5 | #define ARRAY_INIT(array, type) \ 6 | u64 array##_size = 0; \ 7 | u64 array##_capacity = ARRAY_EXPAND_CAPACITY; \ 8 | type *array; \ 9 | MALLOC(array, sizeof(type) * array##_capacity); 10 | 11 | #define ARRAY_ADD(array, size, type) \ 12 | do { \ 13 | if (array##_size + size > array##_capacity) { \ 14 | array##_capacity += ARRAY_EXPAND_CAPACITY; \ 15 | REALLOC(array, sizeof(type) * array##_capacity); \ 16 | } \ 17 | array##_size += size; \ 18 | } while(0) 19 | 20 | #define ARRAY_APPEND(array, val, type) \ 21 | do { \ 22 | if (array##_size == array##_capacity) { \ 23 | array##_capacity += ARRAY_EXPAND_CAPACITY; \ 24 | REALLOC(array, sizeof(type) * array##_capacity); \ 25 | } \ 26 | array[array##_size++] = val; \ 27 | } while(0) 28 | 29 | #define ARRAY_POP(array, dst) \ 30 | do { \ 31 | if (array##_size) { \ 32 | dst = array[--array##_size]; \ 33 | } else { \ 34 | dst = NULL; \ 35 | } \ 36 | } while(0) 37 | 38 | #define ARRAY_RESET(array) \ 39 | do { \ 40 | array##_size = 0; \ 41 | } while(0) 42 | 43 | #define ARRAY_SIZE(array) \ 44 | array##_size 45 | -------------------------------------------------------------------------------- /util/dump.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "row.h" 4 | #include "write.h" 5 | 6 | #define ASSERT_SIZE_IS_VALID() ASSERT(row->sizes[i] <= MAX_COLUMNS - 1, "fatal: cannot have columns with more than 2**16 - 1 bytes, column: %d, size: %d, content: %.*s...\n", i, row->sizes[i], 10, row->columns[i]) 7 | #define ASSERT_MAX_IS_VALID() ASSERT(row->max <= MAX_COLUMNS, "fatal: cannot have more than 2**16 columns\n") 8 | 9 | // 10 | // NOTE: the memory pointed to by row->columns will be copied by 11 | // dump(), and can safely be mutated after the dump() returns.
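// NOTE: the wire layout written below (and parsed by load.h), assuming
// native-endian u16s, is:
//
//   max:u16 | size_0:u16 ... size_max:u16 | col_0 \0 ... col_max \0
//
// e.g. the two-column row ("ab", "c") dumps as 11 bytes: max=1, sizes=2,1,
// then the bytes 'a' 'b' '\0' 'c' '\0'.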
12 | // 13 | inlined void dump(writebuf_t *wbuf, const row_t *row, i32 file) { 14 | ASSERT_MAX_IS_VALID(); 15 | i32 size = sizeof(u16) + (row->max + 1) * sizeof(u16); // -------------- init size with max:u16 + size1:u16,...sizen:u16 16 | for (i32 i = 0; i <= row->max; i++) 17 | size += row->sizes[i] + 1; // -------------------------------------- update size with column size + \0 18 | write_start(wbuf, size, file); // -------------------------------------- write start in case total size of writes would exceed the buffer 19 | write_bytes(wbuf, TO_UINT16(row->max), sizeof(u16), file); // ---------- write row->max 20 | for (i32 i = 0; i <= row->max; i++) { 21 | ASSERT_SIZE_IS_VALID(); 22 | write_bytes(wbuf, TO_UINT16(row->sizes[i]), sizeof(u16), file); // - write row->sizes 23 | } 24 | for (i32 i = 0; i <= row->max; i++) { 25 | write_bytes(wbuf, row->columns[i], row->sizes[i], file); // -------- write column 26 | write_bytes(wbuf, "\0", 1, file); // ------------------------------- add a trailing \0 after every column to make strcmp easier 27 | } 28 | } 29 | 30 | inlined void dump_raw(writebuf_t *wbuf, const raw_row_t *raw_row, i32 file) { 31 | write_start(wbuf, raw_row->header_size + raw_row->buffer_size, file); 32 | write_bytes(wbuf, raw_row->header, raw_row->header_size, file); 33 | write_bytes(wbuf, raw_row->buffer, raw_row->buffer_size, file); 34 | } 35 | 36 | void dump_flush(writebuf_t *wbuf, i32 file) { 37 | write_flush(wbuf, file); 38 | } 39 | -------------------------------------------------------------------------------- /util/load.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "row.h" 4 | #include "read.h" 5 | 6 | // 7 | // NOTE: the memory pointed to by row->columns can only be used until 8 | // the next call of load_next(), which may mutate that memory. if you 9 | // need it after that you must copy it somewhere else before the next 10 | // call of load_next(). 11 | // 12 | // NOTE: you must not mutate memory pointed to by row->columns, which 13 | // should be considered readonly. 
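// NOTE: a typical consumer loop looks like this (a sketch; rbuf is assumed
// to be a readbuf_t from rbuf_init):
//
//   row_t row;
//   while (1) {
//       load_next(&rbuf, &row, 0);
//       if (row.stop)
//           break;
//       // row.max, row.sizes[i], and row.columns[i] are valid here
//   }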
14 | // 15 | inlined void load_next(readbuf_t *rbuf, row_t *row, i32 file) { 16 | read_bytes(rbuf, sizeof(u16), file); // ------------------------------------- read max, the max zero based index into columns data 17 | switch(rbuf->bytes) { 18 | case sizeof(u16): 19 | row->stop = 0; 20 | row->max = FROM_UINT16(rbuf->buffer); // ---------------------------- parse max 21 | read_bytes_assert(rbuf, (row->max + 1) * sizeof(u16), file); // ----- read sizes 22 | i32 size = row->max + 1; // ----------------------------------------- total size in bytes of all columns, including trailing \0 23 | for (i32 i = 0; i <= row->max; i++) { 24 | row->sizes[i] = FROM_UINT16(rbuf->buffer + i * sizeof(u16)); // - parse sizes 25 | size += row->sizes[i]; // --------------------------------------- update total size 26 | } 27 | read_bytes_assert(rbuf, size * sizeof(u8), file); // ---------------- read all column bytes 28 | row->columns[0] = rbuf->buffer; 29 | for (i32 i = 0; i < row->max; i++) 30 | row->columns[i + 1] = row->columns[i] + row->sizes[i] + 1; // --- setup pointers to read_buffer and skip trailing \0 31 | break; 32 | case 0: 33 | row->stop = 1; // --------------------------------------------------- empty read means EOF 34 | break; 35 | default: 36 | ASSERT(0, "fatal: load.h read size of row got bad num bytes, this should never happen\n"); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /util/queue.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util.h" 4 | 5 | typedef struct node_s node_t; 6 | 7 | struct node_s { 8 | u8 *val; 9 | node_t *next; 10 | }; 11 | 12 | typedef struct queue_s { 13 | i32 size; 14 | i32 capacity; 15 | node_t *head; 16 | node_t *tail; 17 | } queue_t; 18 | 19 | queue_t *queue_init(i32 capacity) { 20 | queue_t *q; 21 | MALLOC(q, sizeof(*q)); 22 | q->size = 0; 23 | q->capacity = capacity; 24 | q->head = NULL; 25 | q->tail = NULL; 26 | return q; 27 | } 28 | 29 | i32 queue_put(queue_t *q, u8 *val) { 30 | if (q->size == q->capacity) 31 | return 1; 32 | node_t *n; 33 | MALLOC(n, sizeof(*n)); 34 | n->val = val; 35 | n->next = NULL; 36 | if (!q->head) { 37 | q->head = n; 38 | q->tail = n; 39 | q->size = 1; 40 | return 0; 41 | } 42 | q->tail->next = n; 43 | q->tail = n; 44 | q->size++; 45 | return 0; 46 | } 47 | 48 | u8 *queue_get(queue_t *q) { 49 | if (!q->size) 50 | return NULL; 51 | node_t *n = q->head; 52 | u8 *val = n->val; 53 | q->head = n->next; 54 | free(n); 55 | q->size--; 56 | return val; 57 | } 58 | -------------------------------------------------------------------------------- /util/read.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util.h" 4 | #include "lz4.h" 5 | 6 | typedef struct readbuf_s { 7 | // public 8 | u8 *buffer; 9 | i32 bytes; 10 | // private 11 | FILE **files; 12 | u8 **buffers; 13 | i32 bytes_left; 14 | i32 bytes_read; 15 | i32 *offset; 16 | i32 *chunk_size; 17 | bool lz4; 18 | u8 *lz4_buf; 19 | i32 lz4_size; 20 | } readbuf_t; 21 | 22 | readbuf_t rbuf_init(FILE **files, i32 num_files, bool lz4) { 23 | readbuf_t *buf; 24 | MALLOC(buf, sizeof(readbuf_t)); 25 | buf->files = files; 26 | MALLOC(buf->buffers, sizeof(u8*) * num_files); 27 | MALLOC(buf->offset, sizeof(i32) * num_files); 28 | MALLOC(buf->chunk_size, sizeof(i32) * num_files); 29 | for (i32 i = 0; i < num_files; i++) { 30 | buf->chunk_size[i] = BUFFER_SIZE; 31 | buf->offset[i] = BUFFER_SIZE; 32 | 
MALLOC(buf->buffers[i], BUFFER_SIZE); 33 | } 34 | buf->lz4 = lz4; 35 | if (lz4) 36 | MALLOC(buf->lz4_buf, BUFFER_SIZE_LZ4); 37 | return *buf; 38 | } 39 | 40 | #define DECOMPRESS(buf) \ 41 | do { \ 42 | i32 decompressed_size = LZ4_decompress_safe(buf->lz4_buf, buf->buffers[file], buf->lz4_size, BUFFER_SIZE); \ 43 | ASSERT(buf->chunk_size[file] == decompressed_size, "fatal: decompress size mismatch\n"); \ 44 | } while(0) 45 | 46 | inlined void read_bytes(readbuf_t *buf, i32 size, i32 file) { 47 | buf->bytes_left = buf->chunk_size[file] - buf->offset[file]; // ------------------------------------ bytes left in the current chunk 48 | buf->bytes = size; 49 | ASSERT(buf->bytes_left >= 0, "fatal: negative bytes_left: %d\n", buf->bytes_left); 50 | if (buf->bytes_left == 0) { // --------------------------------------------------------------------- time to read the next chunk 51 | buf->bytes_read = fread_unlocked(&buf->chunk_size[file], 1, sizeof(i32), buf->files[file]); // - try read chunk size 52 | switch (buf->bytes_read) { 53 | case sizeof(i32): // ----------------------------------------------------------------------- read chunk size succeeded 54 | ASSERT(buf->chunk_size[file] <= BUFFER_SIZE, "fatal: bad chunk size: %d\n", buf->chunk_size[file]); 55 | #ifdef READ_GROWING // when defined hold all data in ram for sorting 56 | MALLOC(buf->buffers[file], buf->chunk_size[file]); 57 | #endif 58 | if (buf->lz4) { 59 | FREAD(&buf->lz4_size, sizeof(i32), buf->files[file]); // --------------------------- read compressed size 60 | FREAD(buf->lz4_buf, buf->lz4_size, buf->files[file]); // --------------------------- read compressed chunk 61 | DECOMPRESS(buf); 62 | } else 63 | FREAD(buf->buffers[file], buf->chunk_size[file], buf->files[file]); // ------------- read the chunk body 64 | buf->offset[file] = 0; // -------------------------------------------------------------- start at the beginning of the new chunk 65 | buf->bytes_left = buf->chunk_size[file]; // -------------------------------------------- bytes_left is the new chunk size 66 | ASSERT(size <= buf->bytes_left, "fatal: diskread, not possible, chunk sizes are known\n"); 67 | break; 68 | case 0: // --------------------------------------------------------------------------------- read chunk size failed 69 | ASSERT(!ferror_unlocked(buf->files[file]), "fatal: read error\n"); 70 | buf->chunk_size[file] = 0; 71 | buf->offset[file] = 0; 72 | buf->bytes = 0; 73 | break; 74 | default: 75 | ASSERT(0, "fatal: impossible\n"); 76 | } 77 | } else 78 | ASSERT(size <= buf->bytes_left, "fatal: ramread, not possible, chunk sizes are known\n"); 79 | buf->buffer = buf->buffers[file] + buf->offset[file]; // ------------------------------------------- update the buffer position for the current read 80 | buf->offset[file] += buf->bytes; // ---------------------------------------------------------------- update the buffer offset 81 | } 82 | 83 | inlined void read_bytes_assert(readbuf_t *buf, i32 size, i32 file) { 84 | read_bytes(buf, size, file); 85 | ASSERT(buf->bytes == size, "didnt read enough, only got: %d, expected: %d\n", (buf)->bytes, size); 86 | } 87 | -------------------------------------------------------------------------------- /util/read_ahead.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "read.h" 4 | #include "util.h" 5 | 6 | typedef struct readaheadbuf_s { 7 | i32 has_nexted; 8 | u8 **last_buffers; 9 | i32 *last_chunk_size; 10 | i32 _i32; 11 | u8 * _u8s; 12 | } readaheadbuf_t; 13 | 14 | 
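// for reference, the chunk framing that read.h parses and write.h emits
// (a sketch, reconstructed from read_bytes/write_flush):
//
//   plain: [chunk_size:i32][chunk_size bytes] ...
//   lz4:   [chunk_size:i32][lz4_size:i32][lz4_size compressed bytes] ...
//
// chunk_size is always the decompressed size; DECOMPRESS() asserts as much.
// read_ahead.h swaps whole chunks in and out so a caller can peek at the
// next chunk and later restore last+current as one contiguous buffer.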
readaheadbuf_t rabuf_init(i32 num_files) { 15 | readaheadbuf_t *buf; 16 | MALLOC(buf, sizeof(readaheadbuf_t)); 17 | buf->has_nexted = 0; 18 | MALLOC(buf->last_buffers, sizeof(u8*) * num_files); 19 | MALLOC(buf->last_chunk_size, sizeof(i32) * num_files); 20 | for (i32 i = 0; i < num_files; i++) { 21 | MALLOC(buf->last_buffers[i], BUFFER_SIZE); 22 | } 23 | return *buf; 24 | } 25 | 26 | inlined void swap(readbuf_t *rbuf, readaheadbuf_t* rabuf, i32 file) { 27 | // swap buffers 28 | rabuf->_u8s = rbuf->buffers[file]; 29 | rbuf->buffers[file] = rabuf->last_buffers[file]; 30 | rabuf->last_buffers[file] = rabuf->_u8s; 31 | // swap chunk sizes 32 | rabuf->_i32 = rbuf->chunk_size[file]; 33 | rbuf->chunk_size[file] = rabuf->last_chunk_size[file]; 34 | rabuf->last_chunk_size[file] = rabuf->_i32; 35 | } 36 | 37 | inlined void read_goto_next_chunk(readbuf_t *rbuf, readaheadbuf_t* rabuf, i32 file) { 38 | swap(rbuf, rabuf, file); 39 | rbuf->offset[file] = rbuf->chunk_size[file]; 40 | rabuf->has_nexted = 1; 41 | } 42 | 43 | inlined void read_goto_last_chunk(readbuf_t *rbuf, readaheadbuf_t* rabuf, i32 file) { 44 | rbuf->offset[file] = 0; 45 | if (rabuf->has_nexted) { 46 | // goto_last only does something if goto_next has been used, and results in: buffer = last_buf + current_buf 47 | swap(rbuf, rabuf, file); 48 | REALLOC(rbuf->buffers[file], rbuf->chunk_size[file] + rabuf->last_chunk_size[file]); 49 | memcpy(rbuf->buffers[file] + rbuf->chunk_size[file], rabuf->last_buffers[file], rabuf->last_chunk_size[file]); 50 | rbuf->chunk_size[file] += rabuf->last_chunk_size[file]; 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /util/read_simple.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util.h" 4 | 5 | typedef struct readbuf_s { 6 | // public 7 | i32 bytes; 8 | u8 *buffer; 9 | // private 10 | i32 *stop; 11 | i32 *offset; 12 | FILE **files; 13 | u8 **buffers; 14 | } readbuf_t; 15 | 16 | readbuf_t rbuf_init(FILE **files, i32 num_files) { 17 | readbuf_t *buf; 18 | MALLOC(buf, sizeof(readbuf_t)); 19 | buf->files = files; 20 | MALLOC(buf->stop, sizeof(i32) * num_files); 21 | for (i32 file = 0; file < num_files; file++) 22 | buf->stop[file] = 0; 23 | MALLOC(buf->offset, sizeof(i32) * num_files); 24 | MALLOC(buf->buffers, sizeof(u8*) * num_files); 25 | for (i32 file = 0; file < num_files; file++) { 26 | buf->offset[file] = BUFFER_SIZE; 27 | MALLOC(buf->buffers[file], BUFFER_SIZE); 28 | } 29 | return *buf; 30 | } 31 | 32 | inlined void read_bytes(readbuf_t *buf, i32 size, i32 file) { 33 | ASSERT(size <= BUFFER_SIZE, "error: cant read more bytes than %d\n", BUFFER_SIZE); 34 | if (buf->stop[file] == 0) { 35 | i32 bytes_left = BUFFER_SIZE - buf->offset[file]; 36 | buf->bytes = size; 37 | if (size > bytes_left) { 38 | memmove(buf->buffers[file], buf->buffers[file] + buf->offset[file], bytes_left); 39 | i32 bytes_todo = BUFFER_SIZE - bytes_left; 40 | i32 bytes = fread_unlocked(buf->buffers[file] + bytes_left, 1, bytes_todo, buf->files[file]); 41 | buf->offset[file] = 0; 42 | if (bytes_todo != bytes) { 43 | ASSERT(!ferror_unlocked(buf->files[file]), "error: couldnt read input\n"); 44 | buf->stop[file] = bytes_left + bytes; 45 | buf->bytes = MIN(size, bytes + bytes_left); 46 | } 47 | } 48 | } else 49 | buf->bytes = MIN(size, buf->stop[file] - buf->offset[file]); 50 | buf->buffer = buf->buffers[file] + buf->offset[file]; 51 | buf->offset[file] += buf->bytes; 52 | } 53 | 
}
-------------------------------------------------------------------------------- /util/row.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util.h" 4 | 5 | typedef struct row_s { 6 | i32 stop; 7 | i32 max; 8 | i32 sizes[MAX_COLUMNS]; 9 | u8 *columns[MAX_COLUMNS]; 10 | } row_t; 11 | 12 | typedef struct raw_row_s { 13 | u16 meta; 14 | u8 *header; 15 | i32 header_size; 16 | u8 *buffer; 17 | i32 buffer_size; 18 | } raw_row_t; 19 | 20 | inlined void row_to_raw(row_t *row, raw_row_t *raw_row) { 21 | raw_row->header_size = sizeof(u16) + (row->max + 1) * sizeof(u16); 22 | raw_row->header = row->columns[0] - raw_row->header_size; 23 | raw_row->buffer = row->columns[0]; 24 | raw_row->buffer_size = 0; 25 | for (i32 i = 0; i <= row->max; i++) 26 | raw_row->buffer_size += row->sizes[i] + 1; 27 | } 28 | 29 | inlined void row_to_raw_malloc(row_t *row, raw_row_t *raw_row) { 30 | raw_row->header_size = sizeof(u16) + (row->max + 1) * sizeof(u16); 31 | MALLOC(raw_row->header, raw_row->header_size); 32 | memcpy(raw_row->header, row->columns[0] - raw_row->header_size, raw_row->header_size); 33 | raw_row->buffer_size = 0; 34 | for (i32 i = 0; i <= row->max; i++) 35 | raw_row->buffer_size += row->sizes[i] + 1; 36 | MALLOC(raw_row->buffer, raw_row->buffer_size); 37 | memcpy(raw_row->buffer, row->columns[0], raw_row->buffer_size); 38 | } 39 | 40 | inlined void raw_row_free(raw_row_t *raw_row) { 41 | free(raw_row->header); 42 | free(raw_row->buffer); 43 | } 44 | -------------------------------------------------------------------------------- /util/write.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util.h" 4 | #include "lz4.h" 5 | 6 | typedef struct writebuf_s { 7 | // private 8 | FILE **files; 9 | u8 **buffer; 10 | i32 *offset; 11 | bool lz4; 12 | u8 *lz4_buf; 13 | i32 lz4_size; 14 | } writebuf_t; 15 | 16 | writebuf_t wbuf_init(FILE **files, i32 num_files, bool lz4) { 17 | writebuf_t *buf; 18 | MALLOC(buf, sizeof(writebuf_t)); 19 | buf->files = files; 20 | MALLOC(buf->buffer, sizeof(u8*) * num_files); 21 | MALLOC(buf->offset, sizeof(i32) * num_files); 22 | for (i32 i = 0; i < num_files; i++) { 23 | buf->offset[i] = 0; 24 | MALLOC(buf->buffer[i], BUFFER_SIZE); 25 | } 26 | buf->lz4 = lz4; 27 | if (lz4) 28 | MALLOC(buf->lz4_buf, BUFFER_SIZE_LZ4); 29 | return *buf; 30 | } 31 | 32 | inlined void write_bytes(writebuf_t *buf, u8 *bytes, i32 size, i32 file) { 33 | memcpy(buf->buffer[file] + buf->offset[file], bytes, size); 34 | buf->offset[file] += size; 35 | } 36 | 37 | #define COMPRESS(buf) \ 38 | LZ4_compress_fast(buf->buffer[file], buf->lz4_buf, buf->offset[file], BUFFER_SIZE_LZ4, LZ4_ACCELERATION) 39 | 40 | inlined void write_flush(writebuf_t *buf, i32 file) { 41 | if (buf->offset[file]) { // ------------------------------------------------ flush with an empty buffer is a nop 42 | FWRITE(&buf->offset[file], sizeof(i32), buf->files[file]); // ---------- write chunk size 43 | if (buf->lz4) { 44 | i32 lz4_size = COMPRESS(buf); // ----------------------------------- compress chunk 45 | FWRITE(&lz4_size, sizeof(i32), buf->files[file]); // ------ write compressed size 46 | FWRITE(buf->lz4_buf, lz4_size, buf->files[file]); // ------ write compressed chunk 47 | } else 48 | FWRITE(buf->buffer[file], buf->offset[file], buf->files[file]); // - write chunk 49 | buf->offset[file] = 0; // ---------------------------------------------- reset the buffer to prepare for the next write 50 | } 51 
| } 52 | 53 | inlined void write_start(writebuf_t *buf, i32 size, i32 file) { 54 | ASSERT(size <= BUFFER_SIZE, "fatal: cant write larger than BUFFER_SIZE\n"); 55 | if (size > BUFFER_SIZE - buf->offset[file]) 56 | write_flush(buf, file); 57 | } 58 | -------------------------------------------------------------------------------- /util/write_simple.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "util.h" 4 | 5 | typedef struct writebuf_s { 6 | // private 7 | FILE **files; 8 | u8 **buffer; 9 | i32 *offset; 10 | } writebuf_t; 11 | 12 | 13 | writebuf_t wbuf_init(FILE **files, i32 num_files) { 14 | writebuf_t *buf; 15 | MALLOC(buf, sizeof(writebuf_t)); 16 | buf->files = files; 17 | MALLOC(buf->buffer, sizeof(u8*) * num_files); 18 | MALLOC(buf->offset, sizeof(i32) * num_files); 19 | for (i32 i = 0; i < num_files; i++) { 20 | buf->offset[i] = 0; 21 | MALLOC(buf->buffer[i], BUFFER_SIZE); 22 | } 23 | return *buf; 24 | } 25 | 26 | inlined void write_flush(writebuf_t *buf, i32 file) { 27 | if (buf->offset[file]) { 28 | FWRITE(buf->buffer[file], buf->offset[file], buf->files[file]); 29 | buf->offset[file] = 0; 30 | } 31 | } 32 | 33 | inlined void write_bytes(writebuf_t *buf, u8 *bytes, i32 size, i32 file) { 34 | ASSERT(size <= BUFFER_SIZE, "fatal: cant write more than BUFFER_SIZE\n"); 35 | if (size > BUFFER_SIZE - buf->offset[file]) 36 | write_flush(buf, file); 37 | memcpy(buf->buffer[file] + buf->offset[file], bytes, size); 38 | buf->offset[file] += size; 39 | } 40 | -------------------------------------------------------------------------------- /vendor/heap.h: -------------------------------------------------------------------------------- 1 | // license: mit 2 | /* from: https://github.com/robin-thomas/min-heap/blob/a1a8d7137f3afdf2b5ebf93b9d4059c4d1dd96e8/minHeap.c */ 3 | 4 | #pragma once 5 | 6 | #include "util.h" 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | 10 | #ifndef HEAP_COMPARE 11 | #define HEAP_COMPARE(meta, x, y) compare(meta, x, y) > 0 12 | #endif 13 | 14 | #define HEAP_LCHILD(x) 2 * x + 1 15 | #define HEAP_RCHILD(x) 2 * x + 2 16 | #define HEAP_PARENT(x) (x - 1) / 2 17 | 18 | typedef struct heap_s { 19 | u16 meta; 20 | i32 size; 21 | u8 **nodes; 22 | } heap_t; 23 | 24 | void heap_swap(u8 **n1, u8 **n2) { 25 | u8* temp = *n1; 26 | *n1 = *n2; 27 | *n2 = temp; 28 | } 29 | 30 | void heap_heapify(heap_t *h, i32 i) { 31 | i32 smallest = (HEAP_LCHILD(i) < h->size && HEAP_COMPARE(h->meta, h->nodes[HEAP_LCHILD(i)], h->nodes[i])) ? HEAP_LCHILD(i) : i; 32 | if(HEAP_RCHILD(i) < h->size && HEAP_COMPARE(h->meta, h->nodes[HEAP_RCHILD(i)], h->nodes[smallest])) 33 | smallest = HEAP_RCHILD(i); 34 | if(smallest != i) { 35 | heap_swap(&(h->nodes[i]), &(h->nodes[smallest])); 36 | heap_heapify(h, smallest); 37 | } 38 | } 39 | 40 | void heap_insert(heap_t *h, u8 *data) { 41 | if(h->size) 42 | h->nodes = realloc(h->nodes, (h->size + 1) * sizeof(u8*)); 43 | else 44 | h->nodes = malloc(sizeof(u8*)); 45 | i32 i = (h->size)++; 46 | while(i && HEAP_COMPARE(h->meta, data, h->nodes[HEAP_PARENT(i)])) { 47 | h->nodes[i] = h->nodes[HEAP_PARENT(i)]; 48 | i = HEAP_PARENT(i); 49 | } 50 | h->nodes[i] = data; 51 | } 52 | 53 | void heap_delete(heap_t *h) { 54 | if(h->size) { 55 | h->nodes[0] = h->nodes[--(h->size)]; 56 | h->nodes = realloc(h->nodes, h->size * sizeof(u8*)); 57 | heap_heapify(h, 0); 58 | } 59 | } 60 | 61 | void heapify(heap_t *h, i32 i) { 62 | i32 smallest = (HEAP_LCHILD(i) < h->size && HEAP_COMPARE(h->meta, h->nodes[HEAP_LCHILD(i)], h->nodes[i])) ? 
HEAP_LCHILD(i) : i; 63 | if(HEAP_RCHILD(i) < h->size && HEAP_COMPARE(h->meta, h->nodes[HEAP_RCHILD(i)], h->nodes[smallest])) { 64 | smallest = HEAP_RCHILD(i); 65 | } 66 | if(smallest != i) { 67 | heap_swap(&(h->nodes[i]), &(h->nodes[smallest])); 68 | heapify(h, smallest); 69 | } 70 | } 71 | 72 | void heap_truncate(heap_t *h, i32 size) { 73 | if (h->size <= size) 74 | return; 75 | heap_t h2 = {0}; 76 | h2.meta = h->meta; 77 | for (i32 i = 0; i < size; i++) { 78 | heap_insert(&h2, h->nodes[0]); 79 | heap_delete(h); 80 | } 81 | while (h->size) { 82 | free(h->nodes[0]); 83 | heap_delete(h); 84 | } 85 | h->nodes = h2.nodes; 86 | h->size = size; 87 | } 88 | 89 | void heap_free(heap_t *h) { 90 | free(h->nodes); 91 | } 92 | -------------------------------------------------------------------------------- /vendor/xxh3.h: -------------------------------------------------------------------------------- 1 | /* source: https://github.com/Cyan4973/xxHash/blob/a9054f397d7f41bc505638df3853b270eb9e7493/xxh3.h */ 2 | /* 3 | * xxHash - Extremely Fast Hash algorithm 4 | * Development source file for `xxh3` 5 | * Copyright (C) 2019-2020 Yann Collet 6 | * 7 | * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) 8 | * 9 | * Redistribution and use in source and binary forms, with or without 10 | * modification, are permitted provided that the following conditions are 11 | * met: 12 | * 13 | * * Redistributions of source code must retain the above copyright 14 | * notice, this list of conditions and the following disclaimer. 15 | * * Redistributions in binary form must reproduce the above 16 | * copyright notice, this list of conditions and the following disclaimer 17 | * in the documentation and/or other materials provided with the 18 | * distribution. 19 | * 20 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | * 32 | * You can contact the author at: 33 | * - xxHash homepage: https://www.xxhash.com 34 | * - xxHash source repository: https://github.com/Cyan4973/xxHash 35 | */ 36 | 37 | /* 38 | * Note: This file used to host the source code of XXH3_* variants. 39 | * during the development period. 40 | * The source code is now properly integrated within xxhash.h. 41 | * 42 | * xxh3.h is no longer useful, 43 | * but it is still provided for compatibility with source code 44 | * which used to include it directly. 45 | * 46 | * Programs are now highly discouraged to include xxh3.h. 47 | * Include `xxhash.h` instead, which is the officially supported interface. 48 | * 49 | * In the future, xxh3.h will start to generate warnings, then errors, 50 | * then it will be removed from source package and from include directory. 
51 | */ 52 | 53 | /* Simulate the same impact as including the old xxh3.h source file */ 54 | 55 | #define XXH_INLINE_ALL 56 | #include "xxhash.h" 57 | --------------------------------------------------------------------------------