├── .bettercodehub.yml ├── .github └── workflows │ └── c-cpp.yml ├── .gitignore ├── .gitmodules ├── .travis.yml ├── LICENSE.txt ├── Makefile ├── README.md ├── bench ├── csvkit-csvcut.py ├── csvkit-csvgrep.py ├── generate.c ├── generate.h ├── runner.c └── timer.h ├── src ├── .gitignore ├── csv_tokenizer.c ├── csv_tokenizer.h ├── csvawk.c ├── csvcut.c ├── csvgrep.c ├── csvpipe.c ├── csvunpipe.c ├── debug.h └── hints.h └── test ├── csv_tokenizer_counts.c ├── csvawk ├── corners_command ├── corners_input.csv ├── corners_output.csv ├── large_command ├── large_input.csv ├── large_output.csv ├── simple_command ├── simple_input.csv └── simple_output.csv ├── csvcut ├── canada_keep_note_command ├── canada_keep_note_input.csv.xz ├── canada_keep_note_output.csv.xz ├── column_quoted1_command ├── column_quoted1_input.csv ├── column_quoted1_output.csv ├── corners_drop_ab_command ├── corners_drop_ab_input.csv ├── corners_drop_ab_output.csv ├── corners_keep_ab_command ├── corners_keep_ab_input.csv ├── corners_keep_ab_output.csv ├── large_keep_12_command ├── large_keep_12_input.csv ├── large_keep_12_output.csv ├── large_keep_col1_command ├── large_keep_col1_input.csv ├── large_keep_col1_output.csv ├── large_keep_col23_command ├── large_keep_col23_input.csv ├── large_keep_col23_output.csv ├── overlapping_column_names2_command ├── overlapping_column_names2_input.csv ├── overlapping_column_names2_output.csv ├── overlapping_column_names_command ├── overlapping_column_names_input.csv ├── overlapping_column_names_output.csv ├── simple_drop_a_command ├── simple_drop_a_input.csv ├── simple_drop_a_output.csv ├── simple_drop_ab_command ├── simple_drop_ab_input.csv ├── simple_drop_ab_output.csv ├── simple_keep_ab_command ├── simple_keep_ab_input.csv ├── simple_keep_ab_output.csv ├── simple_keep_ae_command ├── simple_keep_ae_input.csv └── simple_keep_ae_output.csv ├── csvgrep ├── char_range_command ├── char_range_input.csv.xz ├── char_range_output.csv.xz ├── empty_cell_command ├── 
empty_cell_input.csv ├── empty_cell_output.csv ├── integer_range_command ├── integer_range_input.csv.xz ├── integer_range_output.csv.xz ├── not_option-text_command ├── not_option-text_input.csv ├── not_option-text_output.csv ├── not_quoted_cell_command ├── not_quoted_cell_input.csv ├── not_quoted_cell_output.csv ├── one_field_command ├── one_field_input.csv ├── one_field_output.csv ├── option-text_command ├── option-text_input.csv ├── option-text_output.csv ├── option2-text_command ├── option2-text_input.csv ├── option2-text_output.csv ├── overlapping_columns1_command ├── overlapping_columns1_input.csv ├── overlapping_columns1_output.csv ├── overlapping_columns2_command ├── overlapping_columns2_input.csv ├── overlapping_columns2_output.csv ├── quoted_cell_command ├── quoted_cell_input.csv ├── quoted_cell_output.csv ├── two_NOT_field_command ├── two_NOT_field_input.csv ├── two_NOT_field_output.csv ├── two_field_command ├── two_field_input.csv └── two_field_output.csv ├── csvpipe ├── canada_command ├── canada_input.csv.xz ├── canada_output.csv.xz ├── corners_command ├── corners_input.csv ├── corners_output.csv ├── drop_header_command ├── drop_header_input.csv ├── drop_header_output.csv ├── large-fields_command ├── large-fields_input.csv ├── large-fields_output.csv ├── simple_command ├── simple_input.csv └── simple_output.csv ├── csvtokenizercounts ├── canada_command ├── canada_input.csv.xz ├── canada_output.csv.xz ├── corners_command ├── corners_input.csv ├── corners_output.csv ├── large-fields_command ├── large-fields_input.csv ├── large-fields_output.csv ├── quoted_columns_command ├── quoted_columns_input.csv ├── quoted_columns_output.csv ├── simple_command ├── simple_input.csv ├── simple_output.csv ├── simple_overlapping_columns_command ├── simple_overlapping_columns_input.csv └── simple_overlapping_columns_output.csv ├── csvunpipe ├── canada_command ├── canada_input.csv.xz ├── canada_output.csv.xz ├── corners_command ├── corners_input.csv ├── corners_output.csv 
├── drop_header_command ├── drop_header_input.csv ├── drop_header_output.csv ├── large-fields_command ├── large-fields_input.csv ├── large-fields_output.csv ├── simple_command ├── simple_input.csv └── simple_output.csv ├── data ├── canada-2011-census.csv.xz ├── corners.csv ├── large-fields.csv ├── quoted_columns.csv ├── simple.csv └── simple_overlapping_columns.csv ├── runtest.sh └── test-sizes.sh /.bettercodehub.yml: -------------------------------------------------------------------------------- 1 | component_depth: 1 2 | languages: 3 | - name: cpp 4 | production: 5 | include: 6 | - src/*.c 7 | - src/*.h 8 | exclude: 9 | - test/* 10 | - bench/* 11 | test: 12 | include: 13 | - test/* 14 | - bench/* 15 | -------------------------------------------------------------------------------- /.github/workflows/c-cpp.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | 3 | on: 4 | push: 5 | branches: [ "master", "main" ] 6 | pull_request: 7 | branches: [ "master", "main" ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v3 16 | - name: compile 17 | run: make 18 | - name: test 19 | run: make test 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | bin-old/ 2 | bin/ 3 | *.swp 4 | *.gcda 5 | *.gcno 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "bench/deps/awk-csv-parser"] 2 | path = bench/deps/awk-csv-parser 3 | url = https://github.com/geoffroy-aubry/awk-csv-parser.git 4 | [submodule "bench/deps/pcg-c-basic"] 5 | path = bench/deps/pcg-c-basic 6 | url = https://github.com/imneme/pcg-c-basic.git 7 | -------------------------------------------------------------------------------- /.travis.yml: 
-------------------------------------------------------------------------------- 1 | language: c 2 | compiler: clang 3 | 4 | addons: 5 | apt: 6 | packages: 7 | - libpcre3-dev 8 | sonarcloud: 9 | organization: "davylandman-github" 10 | 11 | script: 12 | - 'if [ "$TRAVIS_PULL_REQUEST" = "false" ]; then build-wrapper-linux-x86-64 --out-dir output make test test-all-sizes-ci DISABLE_ASSERTS="" COVERAGE=1; fi' 13 | - 'if [ "$TRAVIS_PULL_REQUEST" = "true" ]; then make test test-all-sizes-ci DISABLE_ASSERTS="" COVERAGE=1; fi' 14 | 15 | after_success: 16 | - 'if [ "$TRAVIS_PULL_REQUEST" = "false" ]; then sonar-scanner -Dsonar.sources=. -Dsonar.projectKey="DavyLandman_csvtools" -Dsonar.cfamily.build-wrapper-output=output; fi' 17 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Davy Landman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | BUFFER_SIZE=1048576 # 1024K can be overridden with make BUFFER_SIZE=20 2 | LinkFlags= 3 | CFLAGS+=-std=gnu99 -Wall -pedantic -Wextra -DBUFFER_SIZE=$(BUFFER_SIZE) -fno-strict-aliasing 4 | 5 | DISABLE_ASSERTS=-DNDEBUG 6 | ifdef DEBUG # set with `make .. DEBUG=1` 7 | CFLAGS+=-g -DDEBUG 8 | ifdef VERBOSE 9 | CFLAGS+=-DMOREDEBUG 10 | endif 11 | else 12 | CFLAGS+=-O3 $(DISABLE_ASSERTS) 13 | endif 14 | ifdef PERF 15 | CFLAGS+=-lprofiler -g 16 | endif 17 | 18 | DO_COVERAGE="" 19 | ifdef COVERAGE 20 | CFLAGS+=-coverage 21 | DO_COVERAGE="COVERAGE=1" 22 | endif 23 | 24 | 25 | ifndef TEST_SLOW_PATH 26 | UNAME_S := $(shell uname -s) 27 | ifeq ($(UNAME_S),Linux) 28 | CFLAGS += -D_GNU_SOURCE 29 | endif 30 | else 31 | CFLAGS += -D_SLOW_PATH 32 | endif 33 | 34 | CSV_GREP_FILES = src/csvgrep.c src/csv_tokenizer.c 35 | CSV_CUT_FILES = src/csvcut.c src/csv_tokenizer.c 36 | CSV_TOK_TEST_COUNT_FILES = test/csv_tokenizer_counts.c src/csv_tokenizer.c 37 | CSV_PIPE_FILES = src/csvpipe.c 38 | CSV_UNPIPE_FILES = src/csvunpipe.c 39 | CSV_AWK_FILES = src/csvawk.c 40 | BENCH_FILES = bench/runner.c bench/generate.c bench/deps/pcg-c-basic/pcg_basic.c 41 | 42 | .PHONY: all test clean test-csvgrep test-csvcut test-csvpipe test-csvunpipe test-all-sizes test-tokenizer install 43 | 44 | all: bin/csvcut bin/csvgrep bin/csvpipe bin/csvunpipe bin/csvawk bin/csvawk 45 | 46 | # yes, we recompile csv_tokenizer, it keeps the makefile simpler and it allows 47 | # the compiler to do some cross module optimizations :) 48 | 49 | bench: bin/bench 50 | 
bin/bench: $(BENCH_FILES) bin/ all 51 | $(CC) -o $@ $(LinkFlags) $(CFLAGS) $(BENCH_FILES) 52 | 53 | bench/deps/pcg-c-basic/pcg_basic.c: 54 | (cd bench/deps/pcg-c-basic/ && git submodule init && git submodule update) 55 | (cd bench/deps/awk-csv-parser/ && git submodule init && git submodule update) 56 | 57 | csvcut: bin/csvcut 58 | bin/csvcut: $(CSV_CUT_FILES) Makefile bin/ 59 | $(CC) -o $@ $(LinkFlags) $(CFLAGS) $(CSV_CUT_FILES) 60 | 61 | csvpipe: bin/csvpipe 62 | bin/csvpipe: $(CSV_PIPE_FILES) Makefile bin/ 63 | $(CC) -o $@ $(LinkFlags) $(CFLAGS) $(CSV_PIPE_FILES) 64 | 65 | csvunpipe: bin/csvunpipe 66 | bin/csvunpipe: $(CSV_UNPIPE_FILES) Makefile bin/ 67 | $(CC) -o $@ $(LinkFlags) $(CFLAGS) $(CSV_UNPIPE_FILES) 68 | 69 | csvawk: bin/csvawk 70 | bin/csvawk: $(CSV_AWK_FILES) Makefile bin/ 71 | $(CC) -o $@ $(LinkFlags) $(CFLAGS) $(CSV_AWK_FILES) 72 | 73 | csvgrep: bin/csvgrep 74 | bin/csvgrep: $(CSV_GREP_FILES) Makefile bin/ 75 | $(CC) -o $@ $(LinkFlags) $(CFLAGS) `pcre-config --cflags` $(CSV_GREP_FILES) `pcre-config --libs` 76 | 77 | bin/csvtokenizercounts: $(CSV_TOK_TEST_COUNT_FILES) Makefile bin/ 78 | $(CC) -o $@ $(LinkFlags) $(CFLAGS) $(CSV_TOK_TEST_COUNT_FILES) 79 | 80 | bin/: 81 | mkdir bin/ 82 | 83 | ifdef SKIP_LARGE_FILES 84 | LARGE_FILES=0 85 | else 86 | LARGE_FILES=1 87 | endif 88 | 89 | test: test-csvgrep test-csvcut test-csvpipe test-csvunpipe test-csvawk test-tokenizer 90 | 91 | test-csvgrep: bin/csvgrep 92 | cd test && ./runtest.sh csvgrep $(LARGE_FILES) $(DO_COVERAGE) 93 | 94 | test-csvcut: bin/csvcut 95 | cd test && ./runtest.sh csvcut $(LARGE_FILES) $(DO_COVERAGE) 96 | 97 | test-csvpipe: bin/csvpipe 98 | cd test && ./runtest.sh csvpipe $(LARGE_FILES) $(DO_COVERAGE) 99 | 100 | test-csvunpipe: bin/csvunpipe 101 | cd test && ./runtest.sh csvunpipe $(LARGE_FILES) $(DO_COVERAGE) 102 | 103 | test-csvawk: bin/csvawk 104 | cd test && ./runtest.sh csvawk $(LARGE_FILES) $(DO_COVERAGE) 105 | 106 | test-tokenizer: bin/csvtokenizercounts 107 | cd test && 
./runtest.sh csvtokenizercounts $(LARGE_FILES) $(DO_COVERAGE) 108 | 109 | test-all-sizes: 110 | ./test/test-sizes.sh $(DO_COVERAGE) 111 | 112 | test-all-sizes-ci: 113 | curl -s https://codecov.io/bash > /tmp/codecov.sh 114 | bash /tmp/codecov.sh -x "llvm-cov gcov" 115 | ./test/test-sizes.sh $(DO_COVERAGE) 116 | bash /tmp/codecov.sh -x "llvm-cov gcov" 117 | ./test/test-sizes.sh $(DO_COVERAGE) TEST_SLOW_PATH=1 118 | bash /tmp/codecov.sh -x "llvm-cov gcov" 119 | 120 | 121 | 122 | prefix=/usr/local 123 | 124 | install: all 125 | install -m 0755 bin/csvcut $(prefix)/bin/csvcut 126 | install -m 0755 bin/csvgrep $(prefix)/bin/csvgrep 127 | install -m 0755 bin/csvawk $(prefix)/bin/csvawk 128 | install -m 0755 bin/csvpipe $(prefix)/bin/csvpipe 129 | install -m 0755 bin/csvunpipe $(prefix)/bin/csvunpipe 130 | 131 | clean: 132 | rm -rf bin/* 133 | deep-clean: 134 | rm -rf bin/* 135 | rm -rf *.gc{ov,da,no} 136 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # csvtools, fast processing of CSV streams 2 | [![Build Status](https://travis-ci.org/DavyLandman/csvtools.svg?branch=master)](https://travis-ci.org/DavyLandman/csvtools) 3 | [![Coverity Scan Build Status](https://img.shields.io/coverity/scan/5024.svg)](https://scan.coverity.com/projects/5024) 4 | [![codecov.io](https://codecov.io/github/DavyLandman/csvtools/coverage.svg?branch=master)](https://codecov.io/github/DavyLandman/csvtools?branch=master) 5 | 6 | 7 | As our data gets bigger, CSV files grow in size. 8 | The CSV format is not exactly pipe-friendly due to embedded newlines and quoted separators. 9 | [onyxfish/csvkit](https://github.com/onyxfish/csvkit) offers a great set of utilities for most tasks you would want to perform on CSV's in a gnu toolset kind of way. 10 | However, it is not fast. 
For reasonable data sets, this doesn't matter, but for CSVs of more than a few MBs, you start to feel the pain. 11 | 12 | This repository contains gnu-alike tools for parsing [RFC 4180](https://tools.ietf.org/html/rfc4180) CSVs at high speed. 13 | 14 | ## Tools 15 | 16 | - `csvcut` a `cut(1)` equivalent to drop columns from a csv file 17 | - `csvgrep` a `grep(1)` equivalent to match on one or more columns per row, and only keep the rows matching all or any of the patterns. (it uses PCRE for regular expression goodness) 18 | - `csvawk` a wrapper for `awk(1)` which correctly recognizes rows and cells (even across newlines). This is comparable to [geoffroy-aubry/awk-csv-parser](https://github.com/geoffroy-aubry/awk-csv-parser), except that it also supports embedded newlines. 19 | - `csvpipe` and `csvunpipe` translate the newlines separating rows to `\0` such that `sort -z` and `uniq -z` and other null-terminated-line based tools can be used more correctly. 20 | 21 | ## Performance 22 | 23 | Benchmarking is complicated, the primary goal is to measure only that of interest, by reducing the impact of other factors. Originally csvtools was benchmarked on the [Canada 2011 census](http://www12.statcan.gc.ca/census-recensement/2011/dp-pd/prof/details/download-telecharger/comprehensive/comp-csv-tab-dwnld-tlchrgr.cfm?Lang=E), however, we were primarily measuring the limits of the SSD and the caches around the file system. 24 | 25 | Now we benchmark with a custom tool: [`bench/runner.c`](bench/runner.c). This benchmark first generates an in memory random csv data set (see [`bench/generate.c`](bench/generate.c)), and then pipes this into the applications under test. This at least takes the IO and FS out of the equation. 26 | 27 | We compare `csvtools` with other solutions. Note that these solutions might not correctly handle CSV's. The reported numbers are _median_ MiB/s. 
28 | 29 | ### Pure pipe speed 30 | 31 | | command | median speed | 32 | | :-- | --: | 33 | | `cat > /dev/null` | 2042.1 MiB/s | 34 | | `wc -l > /dev/null` | 2149.0 MiB/s | 35 | | `md5sum > /dev/null` | 566.8 MiB/s | 36 | 37 | 38 | ### csvcut 39 | 40 | | scenario | csvkit | cut | sed | csvtools | 41 | | :--- | ---: | ---: | ---: | ---: | 42 | | first column | 8.0 MiB/s | 278.8 MiB/s | 356.9 MiB/s | _644.1 MiB/s_ | 43 | | middle column | 8.1 MiB/s | 280.3 MiB/s | 138.6 MiB/s | _555.8 MiB/s_ | 44 | | last column | 8.0 MiB/s | 280.0 MiB/s | 90.1 MiB/s | _565.0 MiB/s_ | 45 | | two adjoining columns | 7.3 MiB/s | 359 MiB/s | 59.6 MiB/s | _561.6 MiB/s_ | 46 | | two distinct columns | 7.3 MiB/s | 449 MiB/s | 59.8 MiB/s | _480.9 MiB/s_ | 47 | 48 | So even compared to sed or cut, which aren't handling quoted separators correctly, our `csvcut` is much faster. 49 | 50 | ### csvgrep 51 | 52 | | scenario | csvkit | grep | awk | csvtools | 53 | | :--- | ---: | ---: | ---: | ---: | 54 | | first column | 7.6 MiB/s | 347.9 MiB/s | 469.2 MiB/s | _588.0 MiB/s_ | 55 | | middle column | 7.8 MiB/s | 302.8 MiB/s | 379.3 MiB/s | _579.0 MiB/s_ | 56 | | last column | 7.7 MiB/s | 392.7 MiB/s | 341.5 MiB/s | _632.5 MiB/s_ | 57 | | two distinct columns | 9.0 MiB/s | 273.9 MiB/s | 380.0 MiB/s | _569.7 MiB/s_ | 58 | 59 | Faster than grep and awk, this is because the column selection in grep is done with negative character classes multiple times. 60 | 61 | There are of course regular expressions possible where PCRE is slower than grep. 62 | 63 | ### csvawk 64 | 65 | | scenario | awk | awk-csv-parser | csvtools | 66 | | :--- | ---: | ---: | ---: | 67 | | print second column | 428.5 MiB/s | 2.45 MiB/s | _278.5 MiB/s_ | 68 | | sum last column | 350.5 MiB/s | 2.4 MiB/s | _225.9 MiB/s_ | 69 | 70 | Sadly, `csvawk` is slower than pure `awk`. This is caused by the custom record separator (instead of the normal newline). 
Benchmarking `csvawk` piping to `awk` shows it performs around 800 MiB/s, and if newlines are used as separators, the whole `csvawk` performs roughly on par with `awk`'s raw performance. 71 | 72 | However, newlines are not valid separators, since they can occur inside quoted fields. For `csvawk` we generate [`\x1E`](https://en.wikipedia.org/wiki/C0_and_C1_control_codes#Field_separators) between records (as per ISO 646), and [`\x1F`](https://en.wikipedia.org/wiki/C0_and_C1_control_codes#Field_separators) between fields in a record. 73 | 74 | The results of the second benchmark differ, since awk doesn't correctly handle nested separators. 75 | 76 | ### Why so fast? 77 | No malloc & memcpy! 78 | 79 | Or as valgrind reports it: 80 | ``` 81 | ==2473== total heap usage: 18 allocs, 18 frees, 210 bytes allocated 82 | ``` 83 | 84 | In the critical path of tokenizing the csv stream and writing it to `stdout`, there are no copies or memory allocations. The programs read into a buffer from `stdin` (or the file passed as last argument), the tokenizer stores offsets (to that buffer) and lengths in a cell array, and the printer writes from the same buffer, using the offsets and lengths from the cell array. 85 | 86 | ## Installation 87 | 88 | 1. Clone this repository 89 | 2. Navigate to it 90 | 3. `make install` (or with prefix: `make install prefix=~/.apps/`) 91 | 4. enjoy :) 92 | 93 | ## Future work 94 | 95 | - Decide on issue #4 96 | - Think of better names that don't clash with csvkit? 97 | - More tests 98 | - add option to remove the header 99 | - sort on columns? 
100 | 101 | -------------------------------------------------------------------------------- /bench/csvkit-csvcut.py: -------------------------------------------------------------------------------- 1 | ## EASY-INSTALL-ENTRY-SCRIPT: 'csvkit==0.9.0','console_scripts','csvjson' 2 | __requires__ = 'csvkit==0.9.0' 3 | import sys 4 | from pkg_resources import load_entry_point 5 | 6 | if __name__ == '__main__': 7 | sys.exit( 8 | load_entry_point('csvkit==0.9.0', 'console_scripts', 'csvcut')() 9 | ) 10 | -------------------------------------------------------------------------------- /bench/csvkit-csvgrep.py: -------------------------------------------------------------------------------- 1 | ## EASY-INSTALL-ENTRY-SCRIPT: 'csvkit==0.9.0','console_scripts','csvjson' 2 | __requires__ = 'csvkit==0.9.0' 3 | import sys 4 | from pkg_resources import load_entry_point 5 | 6 | if __name__ == '__main__': 7 | sys.exit( 8 | load_entry_point('csvkit==0.9.0', 'console_scripts', 'csvgrep')() 9 | ) 10 | -------------------------------------------------------------------------------- /bench/generate.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "deps/pcg-c-basic/pcg_basic.h" 7 | #include "generate.h" 8 | 9 | 10 | #define MAX(a,b) (((a) > (b)) ? 
(a) : (b)) 11 | 12 | inline static bool one_every(pcg32_random_t* rng, int one_in) { 13 | return pcg32_random_r(rng) < (UINT32_MAX / one_in); 14 | } 15 | 16 | static double random_float(pcg32_random_t* rng) { 17 | return ldexp(pcg32_random_r(rng), -32); 18 | } 19 | 20 | #define RANDOM_RANGE(rng, a,b) ((a) + pcg32_boundedrand_r((rng), (b) - (a))) 21 | 22 | static char random_alpha(pcg32_random_t* rng) { 23 | if (one_every(rng, 2)) { 24 | return RANDOM_RANGE(rng, 'A', 'Z'); 25 | } 26 | return RANDOM_RANGE(rng, 'a', 'z'); 27 | } 28 | 29 | static char random_numeric(pcg32_random_t* rng) { 30 | return RANDOM_RANGE(rng, '0', '9'); 31 | } 32 | 33 | static char random_alpha_numeric(pcg32_random_t* rng) { 34 | if (one_every(rng, 2)) { 35 | return random_numeric(rng); 36 | } 37 | return random_alpha(rng); 38 | } 39 | 40 | 41 | static size_t random_cell(pcg32_random_t* rng, char* restrict target, const unsigned int columns, const size_t cell_size_max) { 42 | size_t written = 0; 43 | for (unsigned int i = 0; i < columns; i++) { 44 | if (i > 0) { 45 | *target++ =','; 46 | written++; 47 | } 48 | size_t cell_size = pcg32_boundedrand_r(rng, random_float(rng) < 0.2 ? 
cell_size_max : MAX(1, cell_size_max / 40)); 49 | if (cell_size < 2) { 50 | cell_size = 2; 51 | } 52 | if (one_every(rng, 3)) { 53 | for (size_t c = 0; c < cell_size; c++) { 54 | *target++ = random_numeric(rng); 55 | } 56 | } 57 | else if (!one_every(rng, 10)) { 58 | for (size_t c = 0; c < cell_size; c++) { 59 | *target++ = random_alpha_numeric(rng); 60 | } 61 | } 62 | else { 63 | *target++ = '"'; 64 | written++; 65 | cell_size -= 2; 66 | for (size_t c = 0; c < cell_size; c++) { 67 | *target++ = random_alpha(rng); 68 | if (c + 2 < cell_size) { 69 | if (one_every(rng, 4)) { 70 | *target++ = ' '; 71 | cell_size--; 72 | written++; 73 | } 74 | else if (one_every(rng, 6)) { 75 | *target++ = ','; 76 | cell_size--; 77 | written++; 78 | } 79 | else if (one_every(rng, 100)) { 80 | *target++ ='"'; 81 | *target++ ='"'; 82 | written += 2; 83 | cell_size -= 2; 84 | } 85 | else if (one_every(rng, 1000)) { 86 | *target++ ='\n'; 87 | cell_size--; 88 | written++; 89 | } 90 | } 91 | } 92 | *target++ = '"'; 93 | written++; 94 | } 95 | written += cell_size; 96 | } 97 | *target++ = '\n'; 98 | written++; 99 | return written; 100 | } 101 | 102 | size_t generate_csv(char* restrict buffer, size_t size, size_t* ten_percent, unsigned int seed1, unsigned int seed2, unsigned int columns) { 103 | const size_t original_size = size; 104 | char* restrict current_char = buffer; 105 | for (unsigned int i = 1; i <= columns; i++) { 106 | if (i > 1) { 107 | *current_char++ = ','; 108 | size--; 109 | } 110 | memcpy(current_char, "column", 6); 111 | current_char += 6; 112 | size -= 6; 113 | int len = snprintf(current_char, (CHAR_BIT * sizeof(int) - 1) / 3 + 2, "%d", i); 114 | current_char += len; 115 | size -= len; 116 | } 117 | *current_char++ = '\n'; 118 | size--; 119 | 120 | pcg32_random_t rng; 121 | 122 | pcg32_srandom_r(&rng, seed1, seed2); 123 | 124 | unsigned int cell_large_max = 255; 125 | 126 | while (size > ((cell_large_max + 1) * columns + 1)) { 127 | size_t written = random_cell(&rng, 
/*
 * Runs `command` through popen() `repeats` times, feeding it `buffer`
 * (`buffer_size` bytes) `buffer_copy` times per run, and stores the elapsed
 * getRealTime() difference of every run in `results[r]`.
 *
 * The clock is stopped after pclose(), so the time the child needs to drain
 * the pipe and exit is part of the measurement.
 *
 * A run that could not be started, could not be fed completely, or whose
 * command exited with a non-zero status is recorded as -1 instead of a
 * duration.
 *
 * `results` must have room for `repeats` entries.
 */
static void run(const char* restrict command, const char* restrict buffer, size_t buffer_size, unsigned int buffer_copy, unsigned int repeats, double* results) {
    for (unsigned int r = 0; r < repeats; r++) {
        FILE* target = popen(command, "w");
        if (!target) {
            fprintf(stderr, "Can't start \"%s\"\n", command);
            results[r] = -1;
            /* nothing to write to or close: using the NULL stream would crash */
            continue;
        }

        int failed = 0;
        const double start = getRealTime();
        for (unsigned int b = 0; b < buffer_copy; b++) {
            if (fwrite(buffer, sizeof(char), buffer_size, target) != buffer_size) {
                fprintf(stderr, "Could not write all data to \"%s\"\n", command);
                failed = 1;
                break;
            }
            fflush(target);
        }
        if (pclose(target) != 0) {
            fprintf(stderr, "\"%s\" had an error.\n", command);
            failed = 1;
        }
        const double stop = getRealTime();

        /* keep the failure marker instead of overwriting it with a timing */
        results[r] = failed ? -1 : (stop - start);
    }
}
/*
 * Returns the median of `data`, which must already be sorted (ascending or
 * descending, the result is the same).
 *
 * For an odd number of elements this is the middle element, for an even
 * number the mean of the two middle elements. An empty array yields 0.0
 * rather than reading outside of `data`.
 */
static double median(double* data, size_t elements) {
    if (elements == 0) {
        /* the index arithmetic below would wrap around for an empty array */
        return 0.0;
    }
    const size_t mid = elements / 2;
    if (elements % 2 == 1) {
        return data[mid];
    }
    return (data[mid - 1] + data[mid]) / 2;
}
/*
 * Benchmarks plain awk as a "csvgrep" stand-in: a regex match on the first,
 * middle and last column, and on two columns at once. Every scenario is
 * handed to print_run(), which executes it and prints one result row.
 *
 * All four scenarios are fed the same amount of data (`buffer_copy` copies
 * of `buffer`), so the numbers reported for this tool are measured on equal
 * input.
 */
static void csvgrep_awk(const char* restrict buffer, size_t buffer_size, unsigned int buffer_copy, unsigned int repeats, unsigned int columns) {
    fprintf(stderr, "Running awk grep\n");
    print_run("awk grep", "first column", "LC_ALL='C' awk -F\",\" '$1 ~ /.*[a-e]+.*/ { print }' > /dev/null", buffer, buffer_size, buffer_copy, repeats);

    char command[255];
    snprintf(command, sizeof command, "LC_ALL='C' awk -F\",\" '$%u ~ /.*[a-e]+.*/ { print }' > /dev/null", columns / 2);
    print_run("awk grep", "middle column", command, buffer, buffer_size, buffer_copy, repeats);

    snprintf(command, sizeof command, "LC_ALL='C' awk -F\",\" '$%u ~ /.*[a-e]+.*/ { print }' > /dev/null", columns);
    print_run("awk grep", "last column", command, buffer, buffer_size, buffer_copy, repeats);

    snprintf(command, sizeof command, "LC_ALL='C' awk -F\",\" '$%u ~ /.*[a-e]+.*/ && $%u ~ /.*[F-L]+.*/ { print }' > /dev/null", columns / 2, columns - 1);
    print_run("awk grep", "two columns", command, buffer, buffer_size, buffer_copy, repeats);
}
/*
 * Benchmarks grep as a "csvgrep" stand-in. Column selection is emulated by
 * prefixing the pattern with one "[^,]*" per cell to skip, joined by commas.
 *
 * The skip prefixes and the command line grow with `columns`, so they are
 * allocated to fit instead of living in fixed-size stack arrays.
 *
 * NOTE(review): the "two columns" pattern skips columns/2 cells, matches one,
 * skips (columns-1)-(columns/2) more and then expects another cell; by my
 * count that last cell is one past `columns` for unquoted rows, and
 * "[^,F-L][F-L]" lacks the '*' its "[^,a-e]*[a-e]" sibling has. The patterns
 * are left exactly as they were; confirm the intent before changing them.
 */
static void csvgrep_gnugrep(const char* restrict buffer, size_t buffer_size, unsigned int buffer_copy, unsigned int repeats, unsigned int columns) {
    fprintf(stderr, "Running gnu grep\n");

    print_run("gnutools grep", "first column", "LC_ALL='C' grep \"^[^,a-e]*[a-e][a-e]*\" > /dev/null", buffer, buffer_size, buffer_copy, repeats);

    static const char skip_cell[] = "[^,]*";
    /* bytes per repetition: the pattern plus one separator (or the final NUL) */
    const size_t per_cell = sizeof skip_cell;
    const size_t leading = columns / 2;
    const size_t between = (columns - 1) - (columns / 2);

    /* 128 bytes comfortably holds the fixed text of the longest command below */
    const size_t command_size = 128 + leading * per_cell + between * per_cell;
    char* skip_commands = malloc(leading * per_cell + 1);
    char* skip_commands2 = malloc(between * per_cell + 1);
    char* command = malloc(command_size);
    if (!skip_commands || !skip_commands2 || !command) {
        fprintf(stderr, "Out of memory while building the grep commands\n");
        free(command);
        free(skip_commands2);
        free(skip_commands);
        return;
    }

    repeat(skip_commands, skip_cell, ',', leading);
    repeat(skip_commands2, skip_cell, ',', between);

    snprintf(command, command_size, "LC_ALL='C' grep \"^%s,[^,a-e]*[a-e][a-e]*\" > /dev/null", skip_commands);
    print_run("gnutools grep", "middle column", command, buffer, buffer_size, buffer_copy, repeats);

    print_run("gnutools grep", "last column", "LC_ALL='C' grep \"[a-e][a-e]*[^,a-e]*$\" > /dev/null", buffer, buffer_size, buffer_copy, repeats);

    snprintf(command, command_size, "LC_ALL='C' grep \"^%s,[^,a-e]*[a-e][a-e]*[^,]*,%s,[^,F-L][F-L][F-L]*\" > /dev/null", skip_commands, skip_commands2);
    print_run("gnutools grep", "two columns", command, buffer, buffer_size, buffer_copy, repeats);

    free(command);
    free(skip_commands2);
    free(skip_commands);
}
208 | 209 | sprintf(command, "python bench/csvkit-csvcut.py -c column%u,column%u > /dev/null", columns - 3,columns - 2); 210 | print_run("csvkit csvcut", "two adjoining column", command, buffer, buffer_size, 1, repeats); 211 | 212 | sprintf(command, "python bench/csvkit-csvcut.py -c column%u,column%u > /dev/null", columns / 2,columns - 1); 213 | print_run("csvkit csvcut", "two distinct column", command, buffer, buffer_size, 1, repeats); 214 | } 215 | 216 | static void csvcut_gnucut(const char* restrict buffer, size_t buffer_size, unsigned int buffer_copy, unsigned int repeats, unsigned int columns) { 217 | fprintf(stderr, "Running gnu cut\n"); 218 | 219 | char args[255]; 220 | args[0] = '\0'; 221 | size_t written = 0; 222 | for (unsigned int col = 2; col <= columns; col++) { 223 | sprintf(args + written, "%u", col); 224 | written = strlen(args); 225 | args[written++] = ','; 226 | } 227 | if (written > 0) { 228 | args[written - 1] = '\0'; 229 | } 230 | 231 | char command[255]; 232 | sprintf(command, "cut -d ',' -f %s > /dev/null", args); 233 | print_run("cut csvcut", "first column", command, buffer, buffer_size, 1, repeats); 234 | 235 | args[0] = '\0'; 236 | written = 0; 237 | for (unsigned int col = 1; col <= columns; col++) { 238 | if (col != columns / 2) { 239 | sprintf(args + written, "%u", col); 240 | written = strlen(args); 241 | args[written++] = ','; 242 | } 243 | } 244 | if (written > 0) { 245 | args[written - 1] = '\0'; 246 | } 247 | sprintf(command, "cut -d ',' -f %s > /dev/null", args); 248 | print_run("cut csvcut", "middle column", command, buffer, buffer_size, 1, repeats); 249 | 250 | args[0] = '\0'; 251 | written = 0; 252 | for (unsigned int col = 1; col <= columns - 1; col++) { 253 | sprintf(args + written, "%u", col); 254 | written = strlen(args); 255 | args[written++] = ','; 256 | } 257 | if (written > 0) { 258 | args[written - 1] = '\0'; 259 | } 260 | sprintf(command, "cut -d ',' -f %s > /dev/null", args); 261 | print_run("cut csvcut", "last 
column", command, buffer, buffer_size, 1, repeats); 262 | 263 | args[0] = '\0'; 264 | written = 0; 265 | for (unsigned int col = 1; col <= columns; col++) { 266 | if (col == columns - 3 || col == columns - 2) { 267 | sprintf(args + written, "%u", col); 268 | written = strlen(args); 269 | args[written++] = ','; 270 | } 271 | } 272 | if (written > 0) { 273 | args[written - 1] = '\0'; 274 | } 275 | sprintf(command, "cut -d ',' -f %s > /dev/null", args); 276 | print_run("cut csvcut", "two adjoining column", command, buffer, buffer_size, buffer_copy, repeats); 277 | 278 | args[0] = '\0'; 279 | written = 0; 280 | for (unsigned int col = 1; col <= columns; col++) { 281 | if (col == columns / 2 || col == columns - 1) { 282 | sprintf(args + written, "%u", col); 283 | written = strlen(args); 284 | args[written++] = ','; 285 | args[written] = '\0'; 286 | } 287 | } 288 | if (written > 0) { 289 | args[written - 1] = '\0'; 290 | } 291 | sprintf(command, "cut -d ',' -f %s > /dev/null", args); 292 | print_run("cut csvcut", "two distinct column", command, buffer, buffer_size, buffer_copy, repeats); 293 | } 294 | 295 | static void csvcut_sed(const char* restrict buffer, size_t buffer_size, unsigned int buffer_copy, unsigned int repeats, unsigned int columns) { 296 | fprintf(stderr, "Running gnu sed\n"); 297 | 298 | print_run("sed csvcut", "first column", "LC_ALL='C' sed 's/^[^,]*,//' > /dev/null", buffer, buffer_size, buffer_copy, repeats); 299 | 300 | char command[255]; 301 | sprintf(command, "LC_ALL='C' sed 's/[^,]*,//%u' > /dev/null", columns / 2); 302 | print_run("sed csvcut", "middle column", command, buffer, buffer_size, buffer_copy, repeats); 303 | 304 | sprintf(command, "LC_ALL='C' sed 's/[^,]*,//%u' > /dev/null", columns - 1); 305 | print_run("sed csvcut", "last column", command, buffer, buffer_size, buffer_copy, repeats); 306 | 307 | sprintf(command, "LC_ALL='C' sed -e 's/[^,]*,//%u' -e 's/[^,]*,//%u' > /dev/null", columns - 2, columns - 3 ); 308 | print_run("sed csvcut", 
"two adjoining column", command, buffer, buffer_size, buffer_copy, repeats); 309 | 310 | sprintf(command, "LC_ALL='C' sed -e 's/[^,]*,//%u' -e 's/[^,]*,//%u' > /dev/null", columns - 1, columns / 2); 311 | print_run("sed csvcut", "two distinct column", command, buffer, buffer_size, buffer_copy, repeats); 312 | 313 | } 314 | 315 | 316 | static void csvawk_csvtools(const char* restrict buffer, size_t buffer_size, unsigned int buffer_copy, unsigned int repeats, unsigned int columns) { 317 | fprintf(stderr, "Running csvtools csvawk\n"); 318 | print_run("csvtools csvawk", "print second columnd", "LC_ALL='C' bin/csvawk '{ print $2; }' > /dev/null", buffer, buffer_size, buffer_copy, repeats); 319 | 320 | char command[255]; 321 | sprintf(command, "LC_ALL='C' bin/csvawk 'BEGIN { s = 0; } { s += $%u; } END { print s; }' > /dev/null", columns / 2); 322 | print_run("csvtools csvawk", "sum middle column", command, buffer, buffer_size, buffer_copy, repeats); 323 | 324 | sprintf(command, "LC_ALL='C' bin/csvawk 'BEGIN { s = 0; } { s += $%u; } END { print s; }' > /dev/null", columns); 325 | print_run("csvtools csvawk", "sum last column", command, buffer, buffer_size, buffer_copy, repeats); 326 | } 327 | 328 | static void csvawk_awkraw(const char* restrict buffer, size_t buffer_size, unsigned int buffer_copy, unsigned int repeats, unsigned int columns) { 329 | fprintf(stderr, "Running raw awk\n"); 330 | print_run("raw awk", "print second columnd", "LC_ALL='C' awk -F',' '{ print $2; }' > /dev/null", buffer, buffer_size, buffer_copy, repeats); 331 | 332 | char command[255]; 333 | sprintf(command, "LC_ALL='C' awk -F',' 'BEGIN { s = 0; } { s += $%u; } END { print s; }' > /dev/null", columns / 2); 334 | print_run("raw awk", "sum middle column", command, buffer, buffer_size, buffer_copy, repeats); 335 | 336 | sprintf(command, "LC_ALL='C' awk -F',' 'BEGIN { s = 0; } { s += $%u; } END { print s; }' > /dev/null", columns); 337 | print_run("raw awk", "sum last column", command, buffer, 
buffer_size, buffer_copy, repeats); 338 | } 339 | 340 | static void csvawk_awkcsvparser(const char* restrict buffer, size_t buffer_size, unsigned int buffer_copy, unsigned int repeats, unsigned int columns) { 341 | fprintf(stderr, "Running awk csvparser\n"); 342 | print_run("csvparser awk", "print second columnd", "LC_ALL='C' awk -f bench/deps/awk-csv-parser/src/csv-parser.awk -v separator=',' -v enclosure='\"' --source '{ csv_parse_record($0, separator, enclosure, csv); print csv[1]; }' > /dev/null", buffer, buffer_size, 1, repeats); 343 | 344 | char command[255]; 345 | sprintf(command, "LC_ALL='C' awk -f bench/deps/awk-csv-parser/src/csv-parser.awk -v separator=',' -v enclosure='\"' --source 'BEGIN {s = 0; }{ csv_parse_record($0, separator, enclosure, csv); s += csv[%u]; } END { print s; }' > /dev/null", (columns / 2) - 1); 346 | print_run("csvparser awk", "sum middle column", command, buffer, buffer_size, 1, repeats); 347 | 348 | sprintf(command, "LC_ALL='C' awk -f bench/deps/awk-csv-parser/src/csv-parser.awk -v separator=',' -v enclosure='\"' --source 'BEGIN {s = 0; }{ csv_parse_record($0, separator, enclosure, csv); s += csv[%u]; } END { print s; }' > /dev/null", columns - 1); 349 | print_run("csvparser awk", "sum last column", command, buffer, buffer_size, 1, repeats); 350 | } 351 | 352 | 353 | // based on xxhash avalanche 354 | #define PRIME1 2654435761U 355 | #define PRIME2 2246822519U 356 | #define PRIME3 3266489917U 357 | 358 | static unsigned int xxh_mix(unsigned int x, unsigned int seed) { 359 | unsigned int crc = x + seed + PRIME1; 360 | crc ^= crc >> 15; 361 | crc *= PRIME2; 362 | crc ^= crc >> 13; 363 | crc *= PRIME3; 364 | crc ^= crc >> 16; 365 | return crc; 366 | } 367 | 368 | int main(int argc, char** argv) { 369 | size_t bench_size = 200 * 1024 * 1024; 370 | unsigned int columns = 6; 371 | unsigned int repeats = 5; 372 | unsigned int bench_copy = 2; 373 | unsigned int seed1 = xxh_mix(29, 42); 374 | unsigned int seed2 = xxh_mix(13, 11); 375 | bool 
only_csvtools = false; 376 | bool output_stdout = false; 377 | 378 | char c; 379 | while ((c = getopt (argc, argv, "b:c:r:e:s:xph")) != -1) { 380 | switch (c) { 381 | case 'b': 382 | sscanf(optarg, "%zu", &bench_size); 383 | bench_size *= 1024 * 1024; 384 | break; 385 | case 'c': 386 | sscanf(optarg, "%u", &columns); 387 | break; 388 | case 'r': 389 | sscanf(optarg, "%u", &repeats); 390 | break; 391 | case 'e': 392 | sscanf(optarg, "%u", &bench_copy); 393 | break; 394 | case 's': 395 | sscanf(optarg, "%u", &seed1); 396 | seed1 = xxh_mix(seed1, 42); 397 | break; 398 | case 'x': 399 | only_csvtools = true; 400 | break; 401 | case 'p': 402 | output_stdout = true; 403 | break; 404 | case '?': 405 | case 'h': 406 | default: 407 | print_help(); 408 | exit(1); 409 | break; 410 | } 411 | } 412 | char* buffer = calloc(bench_size, sizeof(char)); 413 | fprintf(stderr, "Preparing data (%zu bytes)\n",bench_size); 414 | size_t data_filled_small; 415 | size_t data_filled = generate_csv(buffer, bench_size, &data_filled_small, seed1, seed2, columns); 416 | fprintf(stderr, "Data ready (%zu bytes)\n",data_filled); 417 | if (output_stdout) { 418 | for (unsigned int b = 0; b < bench_copy; b++) { 419 | fwrite(buffer, sizeof(char), data_filled, stdout); 420 | fflush(stdout); 421 | } 422 | return 0; 423 | } 424 | 425 | fprintf(stdout, "program,name,command,min speed,max speed,median speed"); 426 | fprintf(stdout, "\n"); 427 | 428 | fprintf(stderr, "Running pipe bench fist\n"); 429 | if (!only_csvtools) { 430 | print_run("bench pipe", "cat", "cat > /dev/null", buffer, data_filled, bench_copy, repeats); 431 | print_run("bench pipe", "wc -l", "wc -l > /dev/null", buffer, data_filled, bench_copy, repeats); 432 | print_run("bench pipe", "md5sum", "openssl md5 > /dev/null", buffer, data_filled, bench_copy, repeats); 433 | } 434 | 435 | csvgrep_csvtools(buffer, data_filled, bench_copy, repeats, columns); 436 | if (!only_csvtools) { 437 | csvgrep_csvkit(buffer, data_filled_small, bench_copy, 
repeats, columns); 438 | csvgrep_awk(buffer, data_filled, bench_copy, repeats, columns); 439 | csvgrep_gnugrep(buffer, data_filled, bench_copy, repeats, columns); 440 | } 441 | 442 | csvcut_csvtools(buffer, data_filled, bench_copy, repeats, columns); 443 | if (!only_csvtools) { 444 | csvcut_csvkit(buffer, data_filled_small, bench_copy, repeats, columns); 445 | csvcut_gnucut(buffer, data_filled, bench_copy, repeats, columns); 446 | csvcut_sed(buffer, data_filled, bench_copy, repeats, columns); 447 | } 448 | 449 | csvawk_csvtools(buffer, data_filled, bench_copy, repeats / 3, columns); 450 | if (!only_csvtools) { 451 | csvawk_awkraw(buffer, data_filled, bench_copy, repeats / 3, columns); 452 | csvawk_awkcsvparser(buffer, data_filled_small, bench_copy, repeats / 3, columns); 453 | } 454 | 455 | 456 | return 0; 457 | } 458 | -------------------------------------------------------------------------------- /bench/timer.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Author: David Robert Nadeau 3 | * Site: http://NadeauSoftware.com/ 4 | * License: Creative Commons Attribution 3.0 Unported License 5 | * http://creativecommons.org/licenses/by/3.0/deed.en_US 6 | */ 7 | #ifndef _TIMER_H 8 | #define _TIMER_H 9 | #if defined(_WIN32) 10 | #include 11 | 12 | #elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__)) 13 | #include /* POSIX flags */ 14 | #include /* clock_gettime(), time() */ 15 | #include /* gethrtime(), gettimeofday() */ 16 | 17 | #if defined(__MACH__) && defined(__APPLE__) 18 | #include 19 | #include 20 | #endif 21 | 22 | #else 23 | #error "Unable to define getRealTime( ) for an unknown OS." 24 | #endif 25 | 26 | 27 | 28 | 29 | 30 | /** 31 | * Returns the real time, in seconds, or -1.0 if an error occurred. 32 | * 33 | * Time is measured since an arbitrary and OS-dependent start time. 
34 | * The returned real time is only useful for computing an elapsed time 35 | * between two calls to this function. 36 | */ 37 | double getRealTime( ) 38 | { 39 | #if defined(_WIN32) 40 | FILETIME tm; 41 | ULONGLONG t; 42 | #if defined(NTDDI_WIN8) && NTDDI_VERSION >= NTDDI_WIN8 43 | /* Windows 8, Windows Server 2012 and later. ---------------- */ 44 | GetSystemTimePreciseAsFileTime( &tm ); 45 | #else 46 | /* Windows 2000 and later. ---------------------------------- */ 47 | GetSystemTimeAsFileTime( &tm ); 48 | #endif 49 | t = ((ULONGLONG)tm.dwHighDateTime << 32) | (ULONGLONG)tm.dwLowDateTime; 50 | return (double)t / 10000000.0; 51 | 52 | #elif (defined(__hpux) || defined(hpux)) || ((defined(__sun__) || defined(__sun) || defined(sun)) && (defined(__SVR4) || defined(__svr4__))) 53 | /* HP-UX, Solaris. ------------------------------------------ */ 54 | return (double)gethrtime( ) / 1000000000.0; 55 | 56 | #elif defined(__MACH__) && defined(__APPLE__) 57 | /* OSX. ----------------------------------------------------- */ 58 | static double timeConvert = 0.0; 59 | if ( timeConvert == 0.0 ) 60 | { 61 | mach_timebase_info_data_t timeBase; 62 | (void)mach_timebase_info( &timeBase ); 63 | timeConvert = (double)timeBase.numer / 64 | (double)timeBase.denom / 65 | 1000000000.0; 66 | } 67 | return (double)mach_absolute_time( ) * timeConvert; 68 | 69 | #elif defined(_POSIX_VERSION) 70 | /* POSIX. --------------------------------------------------- */ 71 | #if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0) 72 | { 73 | struct timespec ts; 74 | #if defined(CLOCK_MONOTONIC_PRECISE) 75 | /* BSD. --------------------------------------------- */ 76 | const clockid_t id = CLOCK_MONOTONIC_PRECISE; 77 | #elif defined(CLOCK_MONOTONIC_RAW) 78 | /* Linux. ------------------------------------------- */ 79 | const clockid_t id = CLOCK_MONOTONIC_RAW; 80 | #elif defined(CLOCK_HIGHRES) 81 | /* Solaris. 
----------------------------------------- */ 82 | const clockid_t id = CLOCK_HIGHRES; 83 | #elif defined(CLOCK_MONOTONIC) 84 | /* AIX, BSD, Linux, POSIX, Solaris. ----------------- */ 85 | const clockid_t id = CLOCK_MONOTONIC; 86 | #elif defined(CLOCK_REALTIME) 87 | /* AIX, BSD, HP-UX, Linux, POSIX. ------------------- */ 88 | const clockid_t id = CLOCK_REALTIME; 89 | #else 90 | const clockid_t id = (clockid_t)-1; /* Unknown. */ 91 | #endif /* CLOCK_* */ 92 | if ( id != (clockid_t)-1 && clock_gettime( id, &ts ) != -1 ) 93 | return (double)ts.tv_sec + 94 | (double)ts.tv_nsec / 1000000000.0; 95 | /* Fall thru. */ 96 | } 97 | #endif /* _POSIX_TIMERS */ 98 | 99 | /* AIX, BSD, Cygwin, HP-UX, Linux, OSX, POSIX, Solaris. ----- */ 100 | struct timeval tm; 101 | gettimeofday( &tm, NULL ); 102 | return (double)tm.tv_sec + (double)tm.tv_usec / 1000000.0; 103 | #else 104 | return -1.0; /* Failed. */ 105 | #endif 106 | } 107 | #endif 108 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | -------------------------------------------------------------------------------- /src/csv_tokenizer.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "debug.h" 9 | #include "csv_tokenizer.h" 10 | 11 | #if defined(_GNU_SOURCE) && !defined(SLOW_PATH) 12 | #define FAST_GNU_LIBC 13 | #endif 14 | 15 | enum tokenizer_state { 16 | FRESH, 17 | PREV_NEWLINE, 18 | PREV_CELL, 19 | PREV_QUOTE, 20 | IN_QUOTE 21 | }; 22 | 23 | 24 | struct csv_tokenizer { 25 | const char* restrict buffer; 26 | Cell* restrict cells; 27 | Cell const* restrict cells_end; 28 | #ifdef FAST_GNU_LIBC 29 | char scan_mask[4]; 30 | #endif 31 | 32 | unsigned long long records_processed; 33 | 34 | char separator; 35 | 36 | enum tokenizer_state state; 37 | }; 38 | 
39 | struct csv_tokenizer* setup_tokenizer(char separator, const char* restrict buffer, Cell* restrict cells, size_t cell_size) { 40 | struct csv_tokenizer* tokenizer = malloc(sizeof(struct csv_tokenizer)); 41 | tokenizer->separator = separator; 42 | tokenizer->buffer = buffer; 43 | tokenizer->cells = cells; 44 | tokenizer->cells_end = cells + cell_size - 2; // two room at the end 45 | assert(tokenizer->cells < tokenizer->cells_end); 46 | #ifdef FAST_GNU_LIBC 47 | tokenizer->scan_mask[0] = '\r'; 48 | tokenizer->scan_mask[1] = '\n'; 49 | tokenizer->scan_mask[2] = separator; 50 | tokenizer->scan_mask[3] = '\0'; 51 | #endif 52 | tokenizer->records_processed = 0; 53 | tokenizer->state = FRESH; 54 | return tokenizer; 55 | } 56 | 57 | void free_tokenizer(struct csv_tokenizer* restrict tokenizer) { 58 | free(tokenizer); 59 | } 60 | 61 | static void print_current_line(const char* restrict current_char,const char* restrict buffer_start, const char* restrict buffer_end) { 62 | const char* restrict start = current_char; 63 | const char* restrict end = current_char; 64 | 65 | // find surround newlines 66 | while (--start > buffer_start && *start != '\n' && *start != '\r'); 67 | start++; 68 | while (++end < buffer_end && *end != '\n' && *end != '\r'); 69 | end--; 70 | 71 | // copy string such that we can put a \0 at the end 72 | size_t line_length = end-start; 73 | char* printable_string = calloc(sizeof(char), line_length + 1); 74 | memcpy(printable_string, start, line_length); 75 | printable_string[line_length] = '\0'; 76 | fprintf(stderr, "Current line: %s\n", printable_string); 77 | free(printable_string); 78 | } 79 | 80 | void prepare_tokenization(struct csv_tokenizer* restrict tokenizer, char* restrict buffer, size_t buffer_read) { 81 | buffer[buffer_read] = '\0'; 82 | buffer[buffer_read + 1] = tokenizer->separator; 83 | buffer[buffer_read + 2] = '\r'; 84 | buffer[buffer_read + 3] = '"'; 85 | } 86 | 87 | void tokenize_cells(struct csv_tokenizer* restrict tokenizer, size_t 
buffer_offset, size_t buffer_read, size_t* restrict buffer_consumed, size_t* restrict cells_found, bool* restrict last_full) { 88 | const char* restrict current_char = tokenizer->buffer + buffer_offset; 89 | const char* restrict char_end = tokenizer->buffer + buffer_read; 90 | const char* restrict current_start = current_char; 91 | 92 | 93 | #ifndef FAST_GNU_LIBC 94 | assert(CHAR_BIT == 8); 95 | bool cell_delimitor[256]; 96 | memset(cell_delimitor, false, sizeof(bool) * 256); 97 | cell_delimitor[(unsigned char)tokenizer->separator] = true; 98 | cell_delimitor[(unsigned char)'\n'] = true; 99 | cell_delimitor[(unsigned char)'\r'] = true; 100 | #endif 101 | 102 | Cell* restrict cell = tokenizer->cells; 103 | LOG_V("tokenizer-start\t%d %c (%lu)\n", tokenizer->state, *current_char, buffer_offset ); 104 | 105 | *last_full = true; 106 | enum tokenizer_state old_state = tokenizer->state; 107 | tokenizer->state = FRESH; 108 | switch (old_state) { 109 | case PREV_QUOTE: 110 | if (*current_char == '"') { 111 | // escaped quote so we don't have to decrease the first char 112 | goto IN_QUOTE; 113 | } 114 | current_char--; // jump back, since starts with increment 115 | goto AFTER_QUOTE; 116 | 117 | case IN_QUOTE: 118 | current_char--; // jump back, since the loops starts with increment 119 | goto IN_QUOTE; 120 | 121 | case PREV_NEWLINE: 122 | if ((*current_char == '\n' || *current_char == '\r')) { 123 | while (++current_char < char_end && (*current_char == '\n' || *current_char == '\r')); 124 | } 125 | break; 126 | 127 | case PREV_CELL: 128 | current_char--; // jump back, since the loops starts with increment 129 | goto NORMAL_CELL; 130 | 131 | default: 132 | if (*current_char == tokenizer->separator) { 133 | cell->start = current_start; 134 | cell->length = 0; 135 | cell++; 136 | current_char++; 137 | current_start = current_char; 138 | } 139 | } 140 | 141 | while (current_char < char_end) { 142 | if (*current_char == '"') { 143 | IN_QUOTE:; 144 | while(++current_char < 
char_end) { 145 | #ifdef _GNU_SOURCE 146 | current_char = rawmemchr(current_char, '"'); 147 | #else 148 | current_char = memchr(current_char, '"', char_end - current_char); 149 | #endif 150 | if (current_char == NULL || current_char > char_end) { 151 | // end of stream reached before end of cell found 152 | current_char = char_end; 153 | break; 154 | } 155 | else { 156 | const char* peek = current_char + 1; 157 | assert(peek <= char_end); 158 | if (peek == char_end) { 159 | // at the end of stream and not sure if escaped or not 160 | tokenizer->state = PREV_QUOTE; 161 | *last_full = false; 162 | break; 163 | } 164 | else if (*peek == '"') { 165 | current_char++; 166 | continue; 167 | } 168 | else { 169 | break; 170 | } 171 | } 172 | } 173 | AFTER_QUOTE: 174 | if (current_char != char_end) { 175 | current_char++; 176 | } 177 | cell->start = current_start; 178 | cell->length = (size_t)((current_char)-current_start); 179 | cell++; 180 | 181 | if (current_char == char_end) { 182 | if (*(current_char-1) != '"' || *(current_char-2) == '"' || current_char - 1 == current_start) { 183 | if (tokenizer->state == FRESH) { 184 | tokenizer->state = IN_QUOTE; 185 | } 186 | *last_full = false; 187 | break; 188 | } 189 | *last_full = false; // is this correct? does it ever happen? 
190 | break; 191 | } 192 | 193 | if (*current_char == '\n' || *current_char == '\r') { 194 | cell->start = NULL; 195 | cell->length = -1; 196 | cell++; 197 | tokenizer->records_processed++; 198 | // consume newline 199 | while (++current_char < char_end && (*current_char == '\n' || *current_char == '\r')); 200 | if (current_char == char_end) { 201 | // we stopped inside a new_line 202 | tokenizer->state = PREV_NEWLINE; 203 | break; 204 | } 205 | current_start = current_char; 206 | } 207 | else if (*current_char == tokenizer->separator) { 208 | current_char++; 209 | current_start = current_char; 210 | } 211 | else { 212 | fprintf(stderr, "Invalid character: \"%c (\\%d)\" found after end of cell (after the %lluth record)\n",*current_char, *current_char,tokenizer->records_processed); 213 | print_current_line(current_char, tokenizer->buffer, char_end); 214 | exit(1); 215 | return; 216 | } 217 | if (cell >= tokenizer->cells_end) { 218 | break; 219 | } 220 | } 221 | else if (*current_char == tokenizer->separator) { 222 | // an empty cell somewhere in the middle 223 | cell->start = current_start; 224 | cell->length = 0; 225 | cell++; 226 | current_start = ++current_char; 227 | if (cell >= tokenizer->cells_end) { 228 | break; 229 | } 230 | } 231 | else if (*current_char == '\n' || *current_char == '\r') { 232 | // an newline means that we had an empty cell as last cell of the 233 | // row 234 | cell->start = current_start; 235 | cell->length = 0; 236 | cell++; 237 | 238 | cell->start = NULL; 239 | cell->length = -1; 240 | cell++; 241 | tokenizer->records_processed++; 242 | // consume newline 243 | while (++current_char < char_end && (*current_char == '\n' || *current_char == '\r')); 244 | if (current_char == char_end) { 245 | // we stopped inside a new_line 246 | tokenizer->state = PREV_NEWLINE; 247 | break; 248 | } 249 | current_start = current_char; 250 | if (cell >= tokenizer->cells_end) { 251 | break; 252 | } 253 | } 254 | else { 255 | // start of a new field 256 | 
NORMAL_CELL:; 257 | #ifdef FAST_GNU_LIBC 258 | do { 259 | current_char++; 260 | current_char += strcspn(current_char, tokenizer->scan_mask); 261 | } while (current_char < char_end && *current_char == '\0'); // strspn stops at 0 chars. 262 | #else 263 | while (true) { 264 | if (cell_delimitor[(unsigned char)current_char[1]]) { 265 | current_char += 1; 266 | goto FOUND_CELL_END; 267 | } 268 | if (cell_delimitor[(unsigned char)current_char[2]]) { 269 | current_char += 2; 270 | goto FOUND_CELL_END; 271 | } 272 | if (cell_delimitor[(unsigned char)current_char[3]]) { 273 | current_char += 3; 274 | goto FOUND_CELL_END; 275 | } 276 | current_char += 4; 277 | if (cell_delimitor[(unsigned char)current_char[0]]) { 278 | goto FOUND_CELL_END; 279 | } 280 | } 281 | FOUND_CELL_END:; 282 | #endif 283 | if (current_char > char_end) { 284 | current_char = char_end; 285 | } 286 | cell->start = current_start; 287 | cell->length = (size_t)((current_char)-current_start); 288 | cell++; 289 | 290 | if (current_char == char_end) { 291 | if (*(current_char-1) != tokenizer->separator) { 292 | tokenizer->state = PREV_CELL; 293 | *last_full = false; 294 | break; 295 | } 296 | } 297 | else if (*current_char == '\n' || *current_char == '\r') { 298 | cell->start = NULL; 299 | cell->length = -1; 300 | cell++; 301 | tokenizer->records_processed++; 302 | // consume newline 303 | while (++current_char < char_end && (*current_char == '\n' || *current_char == '\r')); 304 | if (current_char == char_end) { 305 | // we stopped inside a new_line 306 | tokenizer->state = PREV_NEWLINE; 307 | break; 308 | } 309 | current_start = current_char; 310 | } 311 | else if (*current_char == tokenizer->separator) { 312 | current_char++; 313 | current_start = current_char; 314 | } 315 | else { 316 | fprintf(stderr, "Invalid character: \"%c (\\%d)\" found after end of cell (after the %lluth record)\n",*current_char, *current_char,tokenizer->records_processed); 317 | print_current_line(current_char, tokenizer->buffer, 
char_end); 318 | exit(1); 319 | return; 320 | } 321 | if (cell >= tokenizer->cells_end) { 322 | break; 323 | } 324 | } 325 | } 326 | *buffer_consumed = (size_t)(current_char - tokenizer->buffer); 327 | *cells_found = (size_t)(cell - tokenizer->cells); 328 | 329 | LOG_V("tokenizer-done\t%d, %c (%lu) %d\n", tokenizer->state, *(current_char-1), *buffer_consumed , *last_full); 330 | } 331 | 332 | -------------------------------------------------------------------------------- /src/csv_tokenizer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | 5 | // make sure the buffer passed is actually this amount bigger 6 | #define BUFFER_TOKENIZER_POSTFIX 4 7 | 8 | struct csv_tokenizer; 9 | typedef struct { 10 | char const * restrict start; 11 | size_t length; 12 | } Cell; 13 | 14 | void prepare_tokenization(struct csv_tokenizer* restrict tokenizer, char* restrict buffer, size_t buffer_read); 15 | 16 | struct csv_tokenizer* setup_tokenizer(char separator, const char* restrict buffer, Cell* restrict cells, size_t cell_size); 17 | void tokenize_cells(struct csv_tokenizer* restrict tokenizer, size_t buffer_offset, size_t buffer_read, size_t* restrict buffer_consumed, size_t* restrict cells_found, bool* restrict last_full); 18 | void free_tokenizer(struct csv_tokenizer* restrict tokenizer); 19 | -------------------------------------------------------------------------------- /src/csvawk.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "debug.h" 10 | #include "hints.h" 11 | 12 | #define AWK_ROW_SEPARATOR '\x1E' 13 | #define AWK_CELL_SEPARATOR '\x1F' 14 | 15 | #if defined(_GNU_SOURCE) && !defined(SLOW_PATH) 16 | #define FAST_GNU_LIBC 17 | #endif 18 | 19 | static char _buffer[BUFFER_SIZE + 2]; 20 | 21 | static struct { 22 | FILE* source; 23 | char separator; 
24 | bool drop_header; 25 | #ifdef FAST_GNU_LIBC 26 | char scan_mask[4]; 27 | #endif 28 | char* script; 29 | FILE* target; 30 | } config; 31 | 32 | static void start_awk(); 33 | static void parse_config(int argc, char** argv); 34 | static void do_pipe(size_t chars_read); 35 | int main(int argc, char** argv) { 36 | parse_config(argc, argv); 37 | 38 | if (config.target != stdout) { 39 | start_awk(); 40 | } 41 | 42 | size_t chars_read; 43 | SEQUENTIAL_HINT(config.source); 44 | while ((chars_read = fread(_buffer, sizeof(char), BUFFER_SIZE, config.source)) > 0) { 45 | _buffer[chars_read] = '\0'; 46 | _buffer[chars_read+1] = '\"'; 47 | do_pipe(chars_read); 48 | } 49 | if (config.source != stdin) { 50 | fclose(config.source); 51 | } 52 | if (config.target != stdout) { 53 | pclose(config.target); 54 | } 55 | return 0; 56 | } 57 | 58 | static void print_help() { 59 | fprintf(stderr,"usage: csvawk [OPTIONS] AWKSCRIPT [FILE]"); 60 | fprintf(stderr,"options:"); 61 | fprintf(stderr, "-s ,\n"); 62 | fprintf(stderr, " Which character to use as separator (default is ,)\n"); 63 | fprintf(stderr, "-d\n"); 64 | fprintf(stderr, " drop header row\n"); 65 | } 66 | 67 | static void parse_config(int argc, char** argv) { 68 | config.source = stdin; 69 | config.separator = ','; 70 | config.drop_header = false; 71 | 72 | char c; 73 | while ((c = getopt (argc, argv, "s:dp")) != -1) { 74 | switch (c) { 75 | case 's': 76 | config.separator = optarg[0]; 77 | break; 78 | case 'd': 79 | config.drop_header = true; 80 | break; 81 | case 'p': 82 | config.target = stdout; 83 | break; 84 | case '?': 85 | case 'h': 86 | print_help(); 87 | exit(1); 88 | break; 89 | } 90 | } 91 | int args_left = argc - optind; 92 | switch(args_left) { 93 | case 0: 94 | fprintf(stderr, "Missing AWK script\n"); 95 | print_help(); 96 | exit(1); 97 | break; 98 | case 2: 99 | config.source = fopen(argv[argc - 1], "r"); 100 | if (!config.source) { 101 | fprintf(stderr, "Could not open file %s for reading\n", argv[optind]); 102 
| exit(1); 103 | } 104 | // fall through 105 | case 1: 106 | config.script = argv[optind]; 107 | break; 108 | default: 109 | if (args_left > 2) { 110 | fprintf(stderr, "Too many arguments\n"); 111 | } 112 | else { 113 | fprintf(stderr, "Missing AWK script\n"); 114 | } 115 | print_help(); 116 | exit(1); 117 | break; 118 | } 119 | 120 | #ifdef FAST_GNU_LIBC 121 | config.scan_mask[0] = '\r'; 122 | config.scan_mask[1] = '\n'; 123 | config.scan_mask[2] = config.separator; 124 | config.scan_mask[3] = '\"'; 125 | #endif 126 | 127 | LOG_D("%s\n","Done parsing config params"); 128 | } 129 | 130 | enum tokenizer_state { 131 | FRESH, 132 | PREV_NEWLINE, 133 | PREV_QUOTE, 134 | IN_QUOTE, 135 | }; 136 | 137 | static bool first_run = true; 138 | static enum tokenizer_state _state = FRESH; 139 | 140 | static void do_pipe(size_t chars_read) { 141 | char* restrict current_char = _buffer; 142 | char const* restrict char_end = _buffer + chars_read; 143 | char const* restrict current_start = _buffer; 144 | LOG_V("Piping: %zu state: %d first char: %c\n", chars_read, _state, *current_char); 145 | 146 | #ifndef FAST_GNU_LIBC 147 | assert(CHAR_BIT == 8); 148 | bool cell_delimitor[256]; 149 | memset(cell_delimitor, false, sizeof(bool) * 256); 150 | cell_delimitor[(unsigned char)config.separator] = true; 151 | cell_delimitor[(unsigned char)'\n'] = true; 152 | cell_delimitor[(unsigned char)'\r'] = true; 153 | cell_delimitor[(unsigned char)'\"'] = true; 154 | #endif 155 | 156 | if (config.drop_header && first_run) { 157 | while (current_char < char_end) { 158 | if (*current_char == '\n' || *current_char == '\r') { 159 | if (*current_char == '\r') { 160 | _state = PREV_NEWLINE; // handle the windows newlines correctly 161 | } 162 | current_start = ++current_char; 163 | first_run = false; 164 | break; 165 | } 166 | current_char++; 167 | } 168 | if (current_char == char_end) { 169 | return; 170 | } 171 | } 172 | 173 | switch(_state) { 174 | case PREV_QUOTE: 175 | _state = FRESH; // reset state 
176 | if (*current_char == '"') { 177 | // we have two quotes 178 | // one in the previous block, one in the current 179 | goto IN_QUOTE; 180 | } 181 | // we were at the end of the quoted cell, so let's continue 182 | break; 183 | case IN_QUOTE: 184 | current_char--; // the loop starts with a increment 185 | goto IN_QUOTE; 186 | case PREV_NEWLINE: 187 | if (*current_char == '\n') { 188 | // we already had a newline, so lets eat this second windows 189 | // newline 190 | current_char++; 191 | current_start++; 192 | } 193 | _state = FRESH; 194 | break; 195 | default: 196 | break; 197 | } 198 | 199 | while (current_char < char_end) { 200 | if (*current_char == '"') { 201 | IN_QUOTE: 202 | while (++current_char < char_end) { 203 | if (*current_char == '"') { 204 | char const* peek = current_char + 1; 205 | if (peek == char_end) { 206 | current_char++; 207 | _state = PREV_QUOTE; 208 | // at the end of stream and not sure if escaped or not 209 | break; 210 | } 211 | else if (*peek == '"') { 212 | current_char++; 213 | continue; 214 | } 215 | else { 216 | break; 217 | } 218 | } 219 | } 220 | if (current_char == char_end) { 221 | // we are at the end, let's write everything we've seen 222 | if (_state != PREV_QUOTE) { 223 | _state = IN_QUOTE; 224 | } 225 | break; 226 | } 227 | else { 228 | current_char++; 229 | _state = FRESH; 230 | } 231 | } 232 | else if (*current_char == '\n') { 233 | *current_char = AWK_ROW_SEPARATOR; 234 | current_char++; 235 | } 236 | else if (*current_char == '\r') { 237 | *current_char = AWK_ROW_SEPARATOR; 238 | current_char++; 239 | if (current_char == char_end) { 240 | _state = PREV_NEWLINE; 241 | break; 242 | } 243 | else if (*current_char == '\n') { 244 | // we have windows new lines, so lets skip over this byte 245 | fwrite(current_start, sizeof(char), current_char - current_start, config.target); 246 | current_char++; 247 | current_start = current_char; 248 | } 249 | } 250 | else if (*current_char == config.separator) { 251 | *current_char = 
AWK_CELL_SEPARATOR; 252 | current_char++; 253 | } 254 | else { 255 | // all other chars, just skip until we find another interesting character 256 | #ifdef FAST_GNU_LIBC 257 | do { 258 | current_char++; 259 | current_char += strcspn(current_char, config.scan_mask); 260 | } while (current_char < char_end && *current_char == '\0'); // strspn stops at 0 chars. 261 | #else 262 | while (true) { 263 | if (cell_delimitor[(unsigned char)current_char[1]]) { 264 | current_char += 1; 265 | goto FOUND_CELL_END; 266 | } 267 | if (cell_delimitor[(unsigned char)current_char[2]]) { 268 | current_char += 2; 269 | goto FOUND_CELL_END; 270 | } 271 | if (cell_delimitor[(unsigned char)current_char[3]]) { 272 | current_char += 3; 273 | goto FOUND_CELL_END; 274 | } 275 | current_char += 4; 276 | if (cell_delimitor[(unsigned char)current_char[0]]) { 277 | goto FOUND_CELL_END; 278 | } 279 | } 280 | FOUND_CELL_END:; 281 | #endif 282 | while (current_char > char_end) { 283 | // we added a \0 past the end just to detect the end, so let's revert to the actual end 284 | current_char--; 285 | } 286 | } 287 | } 288 | if (current_start < char_end) { 289 | fwrite(current_start, sizeof(char), char_end - current_start, config.target); 290 | } 291 | fflush(config.target); 292 | } 293 | 294 | void start_awk() { 295 | char* prefix = "awk \'BEGIN{ FS=\"\\x1F\"; RS=\"\\x1E\" } "; 296 | char* command = calloc(strlen(prefix) + strlen(config.script) + 2, sizeof(char)); 297 | sprintf(command, "%s %s\'", prefix, config.script); 298 | config.target = popen(command, "w"); 299 | if (!config.target) { 300 | fprintf(stderr, "Can't start \"%s\"\n", command); 301 | exit(1); 302 | } 303 | free(command); 304 | } 305 | 306 | -------------------------------------------------------------------------------- /src/csvcut.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 
"csv_tokenizer.h" 10 | #include "debug.h" 11 | #include "hints.h" 12 | 13 | //#define BUFFER_SIZE 30 14 | //#define BUFFER_SIZE 72 15 | #define CELL_BUFFER_SIZE (BUFFER_SIZE / 2) + 2 16 | 17 | 18 | struct csv_tokenizer* _tokenizer; 19 | 20 | static char _buffer[BUFFER_SIZE + BUFFER_TOKENIZER_POSTFIX]; 21 | static Cell _cells[CELL_BUFFER_SIZE]; 22 | 23 | static struct { 24 | FILE* source; 25 | FILE* target; 26 | 27 | char separator; 28 | char newline[2]; 29 | size_t newline_length; 30 | 31 | bool* keep; 32 | int column_count; 33 | int first_cell; 34 | } config; 35 | 36 | 37 | static void parse_config(int argc, char** argv); 38 | static void finish_config(size_t cells_found); 39 | 40 | static void output_cells(size_t cells_found, bool last_full); 41 | static void debug_cells(size_t total); 42 | 43 | int main(int argc, char** argv) { 44 | size_t chars_read; 45 | bool first = true; 46 | 47 | parse_config(argc, argv); 48 | 49 | SEQUENTIAL_HINT(config.source); 50 | while ((chars_read = fread(_buffer, 1, BUFFER_SIZE, config.source)) > 0) { 51 | LOG_D("New data read: %zu\n", chars_read); 52 | size_t buffer_consumed = 0; 53 | size_t cells_found = 0; 54 | bool last_full = true; 55 | 56 | prepare_tokenization(_tokenizer, _buffer, chars_read); 57 | while (buffer_consumed < chars_read) { 58 | tokenize_cells(_tokenizer, buffer_consumed, chars_read, &buffer_consumed, &cells_found, &last_full); 59 | if (first == true) { 60 | first = false; 61 | finish_config(cells_found); 62 | } 63 | 64 | LOG_D("Processed: %zu, Cells: %zu\n", buffer_consumed, cells_found); 65 | debug_cells(cells_found); 66 | output_cells(cells_found, last_full); 67 | } 68 | } 69 | if (_tokenizer != NULL) { 70 | free_tokenizer(_tokenizer); 71 | } 72 | if (config.keep != NULL) { 73 | free(config.keep); 74 | } 75 | if (config.source != stdin) { 76 | fclose(config.source); 77 | } 78 | return 0; 79 | } 80 | 81 | static void debug_cells(size_t total) { 82 | #ifdef MOREDEBUG 83 | Cell* current_cell = _cells; 84 | Cell* 
cell_end = _cells + total; 85 | 86 | while (current_cell < cell_end) { 87 | if (current_cell->start == NULL) { 88 | LOG_V("Cell %zu : Newline\n", (size_t)(current_cell - _cells)); 89 | } 90 | else if (current_cell->length == 0) { 91 | LOG_V("Cell %zu : \n", (size_t)(current_cell - _cells)); 92 | } 93 | else { 94 | char* s = calloc(sizeof(char), current_cell->length + 1); 95 | s[current_cell->length] = '\0'; 96 | memcpy(s, current_cell->start, current_cell->length); 97 | LOG_V("Cell %zu : %s\n", (size_t)(current_cell - _cells), s); 98 | free(s); 99 | } 100 | current_cell++; 101 | } 102 | #else 103 | (void)total; 104 | #endif 105 | } 106 | 107 | static void print_help() { 108 | fprintf(stderr,"usage: csvcut [OPTIONS] [FILE]"); 109 | fprintf(stderr,"options:"); 110 | fprintf(stderr, "-s ,\n"); 111 | fprintf(stderr, "\tWhich character to use as separator (default is ,)\n"); 112 | fprintf(stderr, "-k column,names,to,keep\n"); 113 | fprintf(stderr, "-d column,names,to,drop\n"); 114 | fprintf(stderr, "-K 0,1,3\n"); 115 | fprintf(stderr, "\tWhich columns to keep\n"); 116 | fprintf(stderr, "-D 0,1,3\n"); 117 | fprintf(stderr, "\tWhich columns to drop\n"); 118 | fprintf(stderr, "-e\n"); 119 | fprintf(stderr, "\tProvide column names one at a time, useful in case of embedded commas.\n"); 120 | } 121 | 122 | enum column_kind { 123 | NONE, 124 | KEEP_NAMES, 125 | KEEP_INDEXES, 126 | DROP_NAMES, 127 | DROP_INDEXES 128 | }; 129 | 130 | static struct { 131 | enum column_kind kind; 132 | size_t cuts_defined; 133 | const char** cuts; 134 | } preconfig; 135 | 136 | static void parse_config(int argc, char** argv) { 137 | config.separator = ','; 138 | config.source = stdin; 139 | 140 | 141 | preconfig.kind = NONE; 142 | preconfig.cuts_defined = 0; 143 | preconfig.cuts = NULL; 144 | 145 | bool one_at_a_time = false; 146 | char c; 147 | while ((c = getopt (argc, argv, "s:k:d:K:D:eh")) != -1) { 148 | switch (c) { 149 | case 'e': 150 | one_at_a_time = true; 151 | break; 152 | case 's': 153 
| config.separator = optarg[0]; 154 | break; 155 | case 'k': 156 | case 'd': 157 | case 'K': 158 | case 'D': 159 | if (!one_at_a_time) { 160 | if (preconfig.kind != NONE) { 161 | fprintf(stderr, "Error, you can only pass one kind of cut option.\n"); 162 | exit(1); 163 | } 164 | preconfig.cuts = malloc(sizeof(char*)); 165 | preconfig.cuts_defined = 1; 166 | preconfig.cuts[0] = strtok(optarg, ","); 167 | char* next_column; 168 | while ((next_column = strtok(NULL, ",")) != NULL) { 169 | preconfig.cuts_defined++; 170 | preconfig.cuts = realloc(preconfig.cuts, sizeof(char*) * preconfig.cuts_defined); 171 | preconfig.cuts[preconfig.cuts_defined - 1] = next_column; 172 | } 173 | } 174 | else { 175 | if (!preconfig.cuts) { 176 | preconfig.cuts = malloc(sizeof(char*)); 177 | preconfig.cuts_defined = 1; 178 | } 179 | else { 180 | preconfig.cuts_defined++; 181 | preconfig.cuts = realloc(preconfig.cuts, sizeof(char*) * preconfig.cuts_defined); 182 | } 183 | preconfig.cuts[preconfig.cuts_defined - 1] = optarg; 184 | } 185 | if (c == 'k') { 186 | if (preconfig.kind != NONE && preconfig.kind != KEEP_NAMES) { 187 | fprintf(stderr, "You can only choose one mode of dropping/keeping columns\n"); 188 | print_help(); 189 | exit(1); 190 | } 191 | preconfig.kind = KEEP_NAMES; 192 | } 193 | else if (c == 'd') { 194 | if (preconfig.kind != NONE && preconfig.kind != DROP_NAMES) { 195 | fprintf(stderr, "You can only choose one mode of dropping/keeping columns\n"); 196 | print_help(); 197 | exit(1); 198 | } 199 | preconfig.kind = DROP_NAMES; 200 | } 201 | else if (c == 'K') { 202 | if (preconfig.kind != NONE && preconfig.kind != KEEP_INDEXES) { 203 | fprintf(stderr, "You can only choose one mode of dropping/keeping columns\n"); 204 | print_help(); 205 | exit(1); 206 | } 207 | preconfig.kind = KEEP_INDEXES; 208 | } 209 | else if (c == 'D') { 210 | if (preconfig.kind != NONE && preconfig.kind != DROP_INDEXES) { 211 | fprintf(stderr, "You can only choose one mode of dropping/keeping columns\n"); 
212 | print_help(); 213 | exit(1); 214 | } 215 | preconfig.kind = DROP_INDEXES; 216 | } 217 | break; 218 | case '?': 219 | case 'h': 220 | print_help(); 221 | exit(1); 222 | break; 223 | } 224 | } 225 | 226 | if (preconfig.kind == NONE) { 227 | fprintf(stderr, "You should describe how you want to cut the csv\n"); 228 | print_help(); 229 | exit(1); 230 | } 231 | 232 | if (optind < argc) { 233 | config.source = fopen(argv[optind], "r"); 234 | if (!config.source) { 235 | fprintf(stderr, "Could not open file %s for reading\n", argv[optind]); 236 | exit(1); 237 | } 238 | } 239 | 240 | LOG_D("%s\n","Done parsing config params"); 241 | 242 | _tokenizer = setup_tokenizer(config.separator, _buffer, _cells,CELL_BUFFER_SIZE); 243 | } 244 | 245 | static char _unquote_buffer[BUFFER_SIZE]; 246 | static char const * unquote(char const* restrict quoted, size_t* restrict length) { 247 | char * restrict result = _unquote_buffer; 248 | char const * restrict current_char = quoted; 249 | char const * restrict char_end = quoted + *length; 250 | while (current_char < char_end) { 251 | if (*current_char == '"') { 252 | // must be an escaped " 253 | current_char++; 254 | (*length)--; 255 | } 256 | *result++ = *current_char++; 257 | } 258 | return _unquote_buffer; 259 | } 260 | 261 | bool str_contains_n(size_t amount, const char** strings, const char* needle, size_t needle_size) { 262 | if (*needle == '"') { 263 | needle++; 264 | needle_size -= 2; 265 | needle = unquote(needle, &needle_size); 266 | } 267 | for (size_t i = 0; i < amount; i++) { 268 | if (strlen(strings[i]) == needle_size && strncasecmp(strings[i], needle, needle_size) == 0) { 269 | return true; 270 | } 271 | } 272 | return false; 273 | } 274 | 275 | static void finish_config(size_t cells_found) { 276 | debug_cells(cells_found); 277 | 278 | Cell const* current_cell = _cells; 279 | while (current_cell < (_cells + cells_found) && current_cell->start != NULL) { 280 | current_cell++; 281 | } 282 | config.column_count = 
(int)(current_cell - _cells); 283 | 284 | const char* new_line = _cells[config.column_count-1].start + _cells[config.column_count - 1].length; 285 | config.newline[0] = new_line[0]; 286 | config.newline_length = 1; 287 | if (new_line[1] == '\n' || new_line[0] == '\r') { 288 | config.newline[1] = '\n'; 289 | config.newline_length = 2; 290 | } 291 | 292 | config.keep = calloc(sizeof(bool), config.column_count); 293 | for (int c = 0; c < config.column_count; c++) { 294 | config.keep[c] = false; 295 | } 296 | 297 | if (preconfig.kind == KEEP_NAMES || preconfig.kind == DROP_NAMES) { 298 | for (int c = 0; c < config.column_count; c++) { 299 | bool cond = str_contains_n(preconfig.cuts_defined, preconfig.cuts, _cells[c].start, _cells[c].length); 300 | if ((cond && (preconfig.kind == KEEP_NAMES)) || (!cond && (preconfig.kind == DROP_NAMES))) { 301 | config.keep[c] = true; 302 | } 303 | } 304 | } 305 | else if (preconfig.kind == KEEP_INDEXES || preconfig.kind == DROP_INDEXES) { 306 | for (int c = 0; c < config.column_count; c++) { 307 | char str_index[15]; 308 | int str_length = sprintf(str_index, "%d", c); 309 | bool cond = str_contains_n(preconfig.cuts_defined, preconfig.cuts, str_index, str_length); 310 | if ((cond && (preconfig.kind == KEEP_INDEXES)) || (!cond && (preconfig.kind == DROP_INDEXES))) { 311 | config.keep[c] = true; 312 | } 313 | } 314 | } 315 | else { 316 | assert(false); 317 | } 318 | free(preconfig.cuts); 319 | for (int c = 0; c < config.column_count; c++) { 320 | if (config.keep[c]) { 321 | config.first_cell = c; 322 | break; 323 | } 324 | } 325 | } 326 | 327 | static bool _half_printed = false; 328 | static int _current_cell_id = 0; 329 | 330 | static void output_cells(size_t cells_found, bool last_full) { 331 | LOG_D("Starting output: %zu (%d)\n", cells_found, last_full); 332 | LOG_V("Entry: current_cell: %d\n", _current_cell_id); 333 | Cell const * restrict current_cell = _cells; 334 | Cell const * restrict cell_end = _cells + cells_found; 335 | 336 | 
char const * restrict current_chunk_start = current_cell->start; 337 | size_t current_chunk_length = 0; 338 | int current_chunk_start_id = _current_cell_id; 339 | 340 | while (current_cell < cell_end) { 341 | //LOG_D("Current cell: %d %p\n", _current_cell_id,current_cell->start); 342 | if (current_cell->start == NULL) { 343 | if (_current_cell_id < config.column_count) { 344 | fprintf(stderr, "Not enough cells in this row, expect: %d, got: %d (cell %zu)\n", config.column_count, _current_cell_id, (size_t)(current_cell - _cells)); 345 | exit(1); 346 | return; 347 | } 348 | if (current_chunk_start != NULL) { 349 | current_chunk_length--; // take away newline 350 | if (current_chunk_start != _buffer || !_half_printed) { 351 | if (current_chunk_start_id != config.first_cell) { 352 | fwrite(&(config.separator),sizeof(char),1, stdout); 353 | } 354 | } 355 | fwrite(current_chunk_start, sizeof(char), current_chunk_length, stdout); 356 | } 357 | fwrite(config.newline, sizeof(char), config.newline_length, stdout); 358 | current_chunk_start = (current_cell + 1)->start; 359 | current_chunk_length = 0; 360 | current_chunk_start_id = 0; 361 | _current_cell_id = -1; 362 | } 363 | if (_current_cell_id >= config.column_count) { 364 | fprintf(stderr, "Too many cells in this row, expect: %d, got: %d (cell: %zu)\n", config.column_count, _current_cell_id, (size_t)(current_cell - _cells)); 365 | exit(1); 366 | return; 367 | } 368 | else if (config.keep[_current_cell_id]) { 369 | current_chunk_length += 1 + current_cell->length; 370 | } 371 | else { 372 | // a column to drop, so lets write the previous chunk 373 | if (_current_cell_id >= config.first_cell && current_chunk_length > 0) { 374 | current_chunk_length--; // take away last seperator 375 | if (current_chunk_start != _buffer || !_half_printed) { 376 | if (current_chunk_start_id != config.first_cell) { 377 | fwrite(&(config.separator),sizeof(char),1, stdout); 378 | } 379 | } 380 | fwrite(current_chunk_start, sizeof(char), 
current_chunk_length, stdout); 381 | } 382 | // begining of the line, nothing happening 383 | current_chunk_start = (current_cell + 1)->start; 384 | current_chunk_length = 0; 385 | current_chunk_start_id = _current_cell_id + 1; 386 | } 387 | 388 | _current_cell_id++; 389 | current_cell++; 390 | } 391 | if (current_chunk_length > 0) { 392 | current_chunk_length--; // fix of by one error 393 | if (current_chunk_start != _buffer || !_half_printed) { 394 | if (current_chunk_start_id != config.first_cell) { 395 | fwrite(&(config.separator),sizeof(char),1, stdout); 396 | } 397 | } 398 | fwrite(current_chunk_start, sizeof(char), current_chunk_length, stdout); 399 | } 400 | if (!last_full) { 401 | 402 | _half_printed = true; 403 | _current_cell_id--; 404 | } 405 | else { 406 | _half_printed = false; 407 | } 408 | LOG_V("Exit: current_cell: %d\n", _current_cell_id); 409 | } 410 | -------------------------------------------------------------------------------- /src/csvgrep.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "csv_tokenizer.h" 8 | #include "debug.h" 9 | #include "hints.h" 10 | #ifndef PCRE_STUDY_JIT_COMPILE 11 | #define PCRE_STUDY_JIT_COMPILE 0 12 | #endif 13 | 14 | #ifndef PCRE_CONFIG_JIT 15 | #define pcre_free_study pcre_free 16 | #endif 17 | 18 | 19 | 20 | //#define BUFFER_SIZE 30 21 | #define CELL_BUFFER_SIZE (BUFFER_SIZE / 2) + 2 22 | 23 | typedef struct { 24 | pcre const* restrict pattern; 25 | pcre_extra const* restrict extra; 26 | } Regex; 27 | 28 | struct csv_tokenizer* _tokenizer; 29 | static char _buffer[BUFFER_SIZE + BUFFER_TOKENIZER_POSTFIX]; 30 | static Cell _cells[CELL_BUFFER_SIZE]; 31 | 32 | static int _have_jit = 0; 33 | 34 | static struct { 35 | FILE* source; 36 | char separator; 37 | char newline[2]; 38 | size_t newline_length; 39 | 40 | bool count_only; 41 | bool negative; 42 | bool or; 43 | bool case_insensitive; 44 | 
Regex* patterns; 45 | 46 | int column_count; 47 | } config; 48 | 49 | static long long _count; 50 | 51 | static char const * unquote(char const* restrict quoted, size_t* restrict length); 52 | static void parse_config(int argc, char** argv); 53 | static size_t finish_config(size_t cells_found); 54 | 55 | static void output_cells(size_t cells_found, size_t offset, bool last_full); 56 | static void debug_cells(size_t total); 57 | 58 | int main(int argc, char** argv) { 59 | 60 | parse_config(argc, argv); 61 | #ifdef PCRE_CONFIG_JIT 62 | pcre_config(PCRE_CONFIG_JIT, &_have_jit); 63 | #else 64 | _have_jit = false; 65 | #endif 66 | if (!_have_jit) { 67 | fprintf(stderr, "I am running without PCRE-JIT support, expect less performance.\n"); 68 | } 69 | 70 | 71 | size_t chars_read; 72 | bool first = true; 73 | SEQUENTIAL_HINT(config.source); 74 | while ((chars_read = fread(_buffer, 1, BUFFER_SIZE, config.source)) > 0) { 75 | LOG_D("New data read: %zu\n", chars_read); 76 | prepare_tokenization(_tokenizer, _buffer, chars_read); 77 | size_t buffer_consumed = 0; 78 | size_t cells_found = 0; 79 | bool last_full = true; 80 | 81 | 82 | while (buffer_consumed < chars_read) { 83 | tokenize_cells(_tokenizer, buffer_consumed, chars_read, &buffer_consumed, &cells_found, &last_full); 84 | LOG_D("Processed: %zu, Cells: %zu\n", buffer_consumed, cells_found); 85 | debug_cells(cells_found); 86 | 87 | size_t cell_offset = 0; 88 | if (first) { 89 | first = false; 90 | cell_offset = finish_config(cells_found); 91 | } 92 | output_cells(cells_found, cell_offset, last_full); 93 | } 94 | } 95 | if (config.count_only) { 96 | fprintf(stdout, "%llu\n", _count); 97 | } 98 | if (_tokenizer != NULL) { 99 | free_tokenizer(_tokenizer); 100 | } 101 | if (config.patterns != NULL) { 102 | for (int c = 0; c < config.column_count; c++) { 103 | if (config.patterns[c].extra) { 104 | pcre_free_study((pcre_extra*)(config.patterns[c].extra)); 105 | } 106 | pcre_free((pcre*)(config.patterns[c].pattern)); 107 | } 108 
| } 109 | if (config.source != stdin) { 110 | fclose(config.source); 111 | } 112 | return 0; 113 | } 114 | 115 | static void debug_cells(size_t total) { 116 | #ifdef MOREDEBUG 117 | Cell* current_cell = _cells; 118 | Cell* cell_end = _cells + total; 119 | 120 | while (current_cell < cell_end) { 121 | if (current_cell->start == NULL) { 122 | LOG_V("Cell %zu : Newline\n", (size_t)(current_cell - _cells)); 123 | } 124 | else if (current_cell->length == 0) { 125 | LOG_V("Cell %zu : \n", (size_t)(current_cell - _cells)); 126 | } 127 | else { 128 | char* s = calloc(sizeof(char), current_cell->length + 1); 129 | s[current_cell->length] = '\0'; 130 | memcpy(s, current_cell->start, current_cell->length); 131 | LOG_V("Cell %zu : %s\n", (size_t)(current_cell - _cells), s); 132 | free(s); 133 | } 134 | current_cell++; 135 | } 136 | #else 137 | (void)total; 138 | #endif 139 | } 140 | 141 | static void print_help() { 142 | fprintf(stderr,"usage: csvgrep [OPTIONS] [FILE]"); 143 | fprintf(stderr,"options:"); 144 | fprintf(stderr, "-s ,\n"); 145 | fprintf(stderr, "\tWhich character to use as separator (default is ,)\n"); 146 | fprintf(stderr, "-p column/pattern/\n"); 147 | fprintf(stderr, "\tMultiple -p are allowed, they work as an AND \n"); 148 | fprintf(stderr, "-i\n"); 149 | fprintf(stderr, "\tuse case insensitive matching\n"); 150 | fprintf(stderr, "-c\n"); 151 | fprintf(stderr, "\tOnly count the rows that match\n"); 152 | fprintf(stderr, "-o\n"); 153 | fprintf(stderr, "\tMake the match into an OR, changes the behavior of -p and -v\n"); 154 | fprintf(stderr, "-v\n"); 155 | fprintf(stderr, "\tPrint only the rows that did not match all patterns\n"); 156 | } 157 | 158 | static struct { 159 | size_t n_patterns; 160 | char ** columns; 161 | char ** patterns; 162 | size_t * column_lengths; 163 | } half_config; 164 | 165 | static void parse_config(int argc, char** argv) { 166 | config.source = stdin; 167 | config.separator = ','; 168 | config.count_only = false; 169 | config.negative 
= false; 170 | config.case_insensitive = false; 171 | config.or = false; 172 | 173 | half_config.n_patterns = 0; 174 | half_config.columns = malloc(sizeof(char*)); 175 | half_config.patterns = malloc(sizeof(char*)); 176 | half_config.column_lengths = malloc(sizeof(size_t)); 177 | 178 | char c; 179 | while ((c = getopt (argc, argv, "s:p:cvio")) != -1) { 180 | switch (c) { 181 | case 's': 182 | config.separator = optarg[0]; 183 | break; 184 | case 'c': 185 | config.count_only = true; 186 | break; 187 | case 'i': 188 | config.case_insensitive = true; 189 | break; 190 | case 'v': 191 | config.negative = true; 192 | break; 193 | case 'o': 194 | config.or = true; 195 | break; 196 | case 'p': 197 | LOG_V("Got pattern: %s\n", optarg); 198 | char* column_name = strtok(optarg, "/"); 199 | char* column_pattern = strtok(NULL, "/"); 200 | for (size_t pat = 0; pat < half_config.n_patterns; pat++) { 201 | if (strcasecmp(column_name, half_config.columns[pat]) == 0) { 202 | fprintf(stderr, "You can only define one pattern per column (column: %s)\n", column_name); 203 | exit(1); 204 | } 205 | } 206 | half_config.n_patterns++; 207 | if (half_config.n_patterns >= 1) { 208 | half_config.columns = realloc(half_config.columns, sizeof(char*) * half_config.n_patterns); 209 | half_config.patterns = realloc(half_config.patterns, sizeof(char*) * half_config.n_patterns); 210 | half_config.column_lengths = realloc(half_config.column_lengths, sizeof(size_t) * half_config.n_patterns); 211 | } 212 | half_config.columns[half_config.n_patterns - 1] = column_name; 213 | half_config.patterns[half_config.n_patterns - 1] = column_pattern; 214 | half_config.column_lengths[half_config.n_patterns - 1] = strlen(column_name); 215 | break; 216 | case '?': 217 | case 'h': 218 | print_help(); 219 | exit(1); 220 | break; 221 | } 222 | } 223 | if (optind < argc) { 224 | config.source = fopen(argv[optind], "r"); 225 | if (!config.source) { 226 | fprintf(stderr, "Could not open file %s for reading\n", 
argv[optind]); 227 | exit(1); 228 | } 229 | } 230 | 231 | if (half_config.n_patterns == 0) { 232 | fprintf(stderr, "You should at least provide one pattern\n"); 233 | print_help(); 234 | exit(1); 235 | } 236 | 237 | LOG_D("%s\n","Done parsing config params"); 238 | 239 | _tokenizer = setup_tokenizer(config.separator, _buffer, _cells, CELL_BUFFER_SIZE); 240 | 241 | } 242 | 243 | static size_t finish_config(size_t cells_found) { 244 | 245 | Cell* current_cell = _cells; 246 | while (current_cell < (_cells + cells_found) && current_cell->start != NULL) { 247 | if (!config.count_only) { 248 | // also immediatly print the header 249 | if (current_cell != _cells) { 250 | fwrite(&(config.separator),sizeof(char),1, stdout); 251 | } 252 | fwrite(current_cell->start, sizeof(char), current_cell->length, stdout); 253 | } 254 | current_cell++; 255 | } 256 | config.column_count = (int)(current_cell - _cells); 257 | 258 | const char* new_line = _cells[config.column_count-1].start + _cells[config.column_count - 1].length; 259 | config.newline[0] = new_line[0]; 260 | config.newline_length = 1; 261 | if (new_line[1] == '\n' && new_line[0] == '\r') { 262 | config.newline[1] = '\n'; 263 | config.newline_length = 2; 264 | } 265 | if (!config.count_only) { 266 | fwrite(config.newline, sizeof(char), config.newline_length, stdout); 267 | } 268 | 269 | bool* used = calloc(sizeof(bool), half_config.n_patterns); 270 | memset(used, 0, sizeof(bool) * half_config.n_patterns); 271 | config.patterns = calloc(sizeof(Regex),config.column_count); 272 | memset(config.patterns, 0, sizeof(Regex) * config.column_count); 273 | for (int c = 0; c < config.column_count; c++) { 274 | const char* column = _cells[c].start; 275 | size_t length = _cells[c].length; 276 | if (*column == '"') { 277 | column++; 278 | length -= 2; 279 | column = unquote(column, &length); 280 | } 281 | for (size_t pat = 0; pat < half_config.n_patterns; pat++) { 282 | if (!used[pat] && length == half_config.column_lengths[pat]) { 283 | 
if (strncasecmp(column, half_config.columns[pat], half_config.column_lengths[pat])==0) { 284 | used[pat] = true; 285 | LOG_V("Adding pattern %s for column: %s (%d)\n", half_config.patterns[pat], half_config.columns[pat],c); 286 | // we have found the column 287 | const char *pcreErrorStr; 288 | int pcreErrorOffset; 289 | config.patterns[c].pattern = pcre_compile(half_config.patterns[pat], PCRE_DOLLAR_ENDONLY | PCRE_DOTALL | PCRE_NO_UTF8_CHECK | (config.case_insensitive ? PCRE_CASELESS : 0), &pcreErrorStr, &pcreErrorOffset, NULL); 290 | if(config.patterns[c].pattern == NULL) { 291 | fprintf(stderr, "ERROR: Could not compile '%s': %s\n", half_config.patterns[pat], pcreErrorStr); 292 | exit(1); 293 | } 294 | config.patterns[c].extra = pcre_study(config.patterns[c].pattern,(_have_jit ? PCRE_STUDY_JIT_COMPILE : 0), &pcreErrorStr); 295 | if(config.patterns[c].extra == NULL && pcreErrorStr != NULL) { 296 | fprintf(stderr, "ERROR: Could not study '%s': %s\n", half_config.patterns[pat], pcreErrorStr); 297 | exit(1); 298 | } 299 | break; 300 | } 301 | } 302 | } 303 | } 304 | 305 | bool stop = false; 306 | for (size_t pat = 0; pat < half_config.n_patterns; pat++) { 307 | if (!used[pat]) { 308 | fprintf(stderr, "ERROR: The column \"%s\" was not found in the header\n", half_config.columns[pat]); 309 | stop = true; 310 | } 311 | } 312 | if (stop) { 313 | fprintf(stderr, "Exiting\n"); 314 | exit(1); 315 | } 316 | 317 | free(used); 318 | free(half_config.columns); 319 | free(half_config.patterns); 320 | free(half_config.column_lengths); 321 | 322 | return config.column_count + 1 ; 323 | } 324 | 325 | 326 | // data for around the edges 327 | static char _prev_line[BUFFER_SIZE * 2]; 328 | static size_t _prev_line_length = 0; 329 | static char _prev_cell[BUFFER_SIZE]; 330 | static size_t _prev_cell_length = 0; 331 | 332 | // state of the output 333 | static int _current_cell_id = 0; 334 | static bool _half_line = false; 335 | static bool _half_cell = false; 336 | static bool 
_prev_matches = true; 337 | 338 | static void output_cells(size_t cells_found, size_t offset, bool last_full) { 339 | LOG_D("Starting output: %zu (%d)\n", cells_found, last_full); 340 | LOG_V("Entry: current_cell: %d\n", _current_cell_id); 341 | 342 | Cell const* restrict current_cell = _cells + offset; 343 | Cell const* restrict cells_end = _cells + cells_found; 344 | 345 | bool matches = !config.or; 346 | if (_half_line) { 347 | matches = _prev_matches; 348 | } 349 | char const* restrict current_line_start = current_cell->start; 350 | size_t current_line_length = 0; 351 | 352 | while (current_cell < cells_end) { 353 | if (_current_cell_id > config.column_count) { 354 | fprintf(stderr, "Too many cells in this row, expect: %d, got: %d (cell: %zu)\n", config.column_count, _current_cell_id, (size_t)(current_cell - _cells)); 355 | exit(1); 356 | return; 357 | } 358 | if (current_cell->start == NULL) { 359 | if (_current_cell_id == config.column_count) { 360 | // end of the line 361 | if (matches) { 362 | if (config.count_only) { 363 | _half_line = false; 364 | _prev_line_length = 0; 365 | _count++; 366 | } 367 | else { 368 | if (_half_line) { 369 | LOG_V("Printed previous half line %zu\n", _prev_line_length); 370 | fwrite(_prev_line, sizeof(char), _prev_line_length, stdout); 371 | _half_line = false; 372 | _prev_line_length = 0; 373 | } 374 | fwrite(current_line_start, sizeof(char), current_line_length, stdout); 375 | fwrite(config.newline, sizeof(char), config.newline_length, stdout); 376 | } 377 | } 378 | else if (_half_line) { 379 | // we stored the previos part of this line, but it can be dropped 380 | _half_line = false; 381 | _prev_line_length = 0; 382 | } 383 | current_line_start = (current_cell + 1)->start; 384 | current_line_length = 0; 385 | _current_cell_id = -1; 386 | matches = !config.or; 387 | } 388 | else if (_current_cell_id < config.column_count) { 389 | fprintf(stderr, "Not enough cells in this row, expect: %d, got: %d (cell %zu)\n", 
config.column_count, _current_cell_id, (size_t)(current_cell - _cells)); 390 | exit(1); 391 | return; 392 | } 393 | } 394 | else if (matches || config.or) { // only if we have a match does it make sense to test other cells 395 | current_line_length += 1 + current_cell->length; 396 | if (_current_cell_id == 0 || current_cell == (_cells + offset)) { 397 | current_line_length--; // the first doesn't have a separator 398 | } 399 | if (config.patterns[_current_cell_id].pattern != NULL) { 400 | char const* restrict cell = current_cell->start; 401 | size_t length = current_cell->length; 402 | if (current_cell == (cells_end-1) && !last_full) { 403 | // we do not have the full cell at the moment, let's copy it 404 | size_t old_cell_length = _prev_cell_length; 405 | _prev_cell_length += current_cell->length; 406 | memcpy(_prev_cell + old_cell_length, current_cell->start, sizeof(char) * current_cell->length); 407 | _half_cell = true; 408 | _current_cell_id++; 409 | break; 410 | } 411 | if (_half_cell && current_cell == _cells) { 412 | // append the current cell to the back of the previous one. 
413 | assert(_prev_cell_length + length < BUFFER_SIZE); 414 | memcpy(_prev_cell + _prev_cell_length, cell, sizeof(char) * length); 415 | cell = _prev_cell; 416 | length += _prev_cell_length; 417 | _prev_cell_length = 0; 418 | } 419 | if (length > 1 && cell[0] == '"') { 420 | cell++; 421 | length -= 2; 422 | char const* restrict c = cell-1; 423 | char const* restrict cell_end = cell + length; 424 | while (++c < cell_end && *c != '"'); 425 | if (c != cell_end) { 426 | // we have nested quotes 427 | cell = unquote(cell, &length); 428 | } 429 | } 430 | int ovector[255]; 431 | int matchResult = pcre_exec(config.patterns[_current_cell_id].pattern, config.patterns[_current_cell_id].extra, cell, length, 0, 0, ovector, 255); 432 | if (config.or) { 433 | matches |= (matchResult >= 0) ^ config.negative; 434 | } 435 | else { 436 | matches &= (matchResult >= 0) ^ config.negative; 437 | } 438 | #ifdef MOREDEBUG 439 | if (matchResult < 0) { 440 | fprintf(stderr, "tried to match :'"); 441 | fwrite(cell, sizeof(char), length, stderr); 442 | fprintf(stderr, "'\n"); 443 | switch(matchResult) { 444 | case PCRE_ERROR_NOMATCH : fprintf(stderr,"String did not match the pattern\n"); break; 445 | case PCRE_ERROR_NULL : fprintf(stderr,"Something was null\n"); break; 446 | case PCRE_ERROR_BADOPTION : fprintf(stderr,"A bad option was passed\n"); break; 447 | case PCRE_ERROR_BADMAGIC : fprintf(stderr,"Magic number bad (compiled re corrupt?)\n"); break; 448 | case PCRE_ERROR_UNKNOWN_NODE : fprintf(stderr,"Something kooky in the compiled re\n"); break; 449 | case PCRE_ERROR_NOMEMORY : fprintf(stderr,"Ran out of memory\n"); break; 450 | default : fprintf(stderr,"Unknown error\n"); break; 451 | } 452 | } 453 | #endif 454 | } 455 | } 456 | 457 | _current_cell_id++; 458 | current_cell++; 459 | } 460 | if (_current_cell_id != 0) { 461 | // the last row wasn't completly printed, so we must be inside a row 462 | _prev_matches = matches; 463 | if (_prev_matches) { 464 | // it could still match, so let's 
copy the line 465 | size_t old_line_length = _prev_line_length; 466 | _prev_line_length += current_line_length; 467 | assert(_prev_line_length < (BUFFER_SIZE * 2)); 468 | memcpy(_prev_line + old_line_length, current_line_start, sizeof(char) * current_line_length); 469 | if (last_full && _current_cell_id != config.column_count) { // the , gets eaten away 470 | _prev_line[_prev_line_length++] = config.separator; 471 | } 472 | #ifdef MOREDEBUG 473 | fprintf(stderr, "current prev line :'"); 474 | fwrite(_prev_line, sizeof(char), _prev_line_length, stderr); 475 | fprintf(stderr, "'\n"); 476 | #endif 477 | } 478 | _half_line = true; 479 | if (!last_full) { 480 | _current_cell_id--; 481 | } 482 | } 483 | else { 484 | _half_line = false; 485 | } 486 | LOG_V("Exit: current_cell: %d\n", _current_cell_id); 487 | } 488 | 489 | 490 | static char _unquote_buffer[BUFFER_SIZE]; 491 | static char const * unquote(char const* restrict quoted, size_t* restrict length) { 492 | char * restrict result = _unquote_buffer; 493 | char const * restrict current_char = quoted; 494 | char const * restrict char_end = quoted + *length; 495 | while (current_char < char_end) { 496 | if (*current_char == '"') { 497 | // must be an escaped " 498 | current_char++; 499 | (*length)--; 500 | } 501 | *result++ = *current_char++; 502 | } 503 | return _unquote_buffer; 504 | } 505 | -------------------------------------------------------------------------------- /src/csvpipe.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "debug.h" 8 | #include "hints.h" 9 | 10 | 11 | #define NULL_ENCODED '\x1a' 12 | 13 | //#define BUFFER_SIZE 3 14 | static char _buffer[BUFFER_SIZE]; 15 | 16 | struct { 17 | FILE* source; 18 | bool drop_header; 19 | } config; 20 | 21 | static void parse_config(int argc, char** argv); 22 | static void do_pipe(size_t chars_read); 23 | 24 | int main(int argc, char** argv) 
{ 25 | parse_config(argc, argv); 26 | 27 | size_t chars_read; 28 | SEQUENTIAL_HINT(config.source); 29 | while ((chars_read = fread(_buffer, sizeof(char), BUFFER_SIZE, config.source)) > 0) { 30 | do_pipe(chars_read); 31 | } 32 | return 0; 33 | } 34 | 35 | static void print_help() { 36 | fprintf(stderr, "usage: csvpipe [OPTIONS] [FILE]"); 37 | fprintf(stderr, "options:"); 38 | fprintf(stderr, "-d\n"); 39 | fprintf(stderr, " drop header row\n"); 40 | } 41 | 42 | static void parse_config(int argc, char** argv) { 43 | config.source = stdin; 44 | config.drop_header = false; 45 | char c; 46 | while ((c = getopt (argc, argv, "d")) != -1) { 47 | switch (c) { 48 | case 'd': 49 | config.drop_header = true; 50 | break; 51 | case '?': 52 | case 'h': 53 | default: 54 | print_help(); 55 | exit(1); 56 | break; 57 | } 58 | } 59 | if (optind < argc) { 60 | config.source = fopen(argv[optind], "r"); 61 | if (!config.source) { 62 | fprintf(stderr, "Could not open file %s for reading\n", argv[optind]); 63 | exit(1); 64 | } 65 | } 66 | } 67 | 68 | enum tokenizer_state { 69 | FRESH, 70 | PREV_NEWLINE, 71 | PREV_QUOTE, 72 | IN_QUOTE, 73 | }; 74 | 75 | void replace_zeroes(char* restrict current_char, char const* restrict char_end) { 76 | while (current_char != NULL) { 77 | current_char = memchr(current_char, '\0', char_end - current_char); 78 | if (current_char != NULL) { 79 | *current_char = NULL_ENCODED; 80 | } 81 | } 82 | } 83 | 84 | 85 | static bool first_run = true; 86 | static enum tokenizer_state _state = FRESH; 87 | 88 | static void do_pipe(size_t chars_read) { 89 | char* restrict current_char = _buffer; 90 | char const* restrict char_end = _buffer + chars_read; 91 | char const* restrict current_start = _buffer; 92 | LOG_V("Piping: %zu state: %d first char: %c\n", chars_read, _state, *current_char); 93 | 94 | if (config.drop_header && first_run) { 95 | while (current_char < char_end) { 96 | if (*current_char == '\n' || *current_char == '\r') { 97 | if (*current_char == '\r') { 98 | 
_state = PREV_NEWLINE; // handle the windows newlines correctly 99 | } 100 | current_start = ++current_char; 101 | first_run = false; 102 | break; 103 | } 104 | current_char++; 105 | } 106 | if (current_char == char_end) { 107 | return; 108 | } 109 | } 110 | // doing this separatly greatly improves the speed of the loop below 111 | replace_zeroes(current_char, char_end); 112 | 113 | switch(_state) { 114 | case PREV_QUOTE: 115 | _state = FRESH; // reset state 116 | if (*current_char == '"') { 117 | // we have two quotes 118 | // one in the previous block, one in the current 119 | goto IN_QUOTE; 120 | } 121 | // we were at the end of the quoted cell, so let's continue 122 | break; 123 | case IN_QUOTE: 124 | current_char--; // the loop starts with a increment 125 | goto IN_QUOTE; 126 | case PREV_NEWLINE: 127 | if (*current_char == '\n') { 128 | // we already had a newline, so lets eat this second windows 129 | // newline 130 | current_char++; 131 | current_start++; 132 | } 133 | _state = FRESH; 134 | break; 135 | default: 136 | break; 137 | } 138 | 139 | while (current_char < char_end) { 140 | if (*current_char == '"') { 141 | IN_QUOTE: 142 | while (++current_char < char_end) { 143 | if (*current_char == '"') { 144 | char const* peek = current_char + 1; 145 | if (peek == char_end) { 146 | current_char++; 147 | _state = PREV_QUOTE; 148 | // at the end of stream and not sure if escaped or not 149 | break; 150 | } 151 | else if (*peek == '"') { 152 | current_char++; 153 | continue; 154 | } 155 | else { 156 | break; 157 | } 158 | } 159 | } 160 | if (current_char == char_end) { 161 | // we are at the end, let's write everything we've seen 162 | if (_state != PREV_QUOTE) { 163 | _state = IN_QUOTE; 164 | } 165 | break; 166 | } 167 | else { 168 | current_char++; 169 | _state = FRESH; 170 | } 171 | } 172 | else if (*current_char == '\n') { 173 | *current_char = '\0'; 174 | current_char++; 175 | } 176 | else if (*current_char == '\r') { 177 | *current_char = '\0'; 178 | 
current_char++; 179 | if (current_char == char_end) { 180 | _state = PREV_NEWLINE; 181 | break; 182 | } 183 | else if (*current_char == '\n') { 184 | // we have windows new lines, so lets skip over this byte 185 | fwrite(current_start, sizeof(char), current_char - current_start, stdout); 186 | current_char++; 187 | current_start = current_char; 188 | } 189 | } 190 | else { 191 | // all other chars, just skip one 192 | current_char++; 193 | } 194 | } 195 | if (current_start < char_end) { 196 | fwrite(current_start, sizeof(char), char_end - current_start, stdout); 197 | } 198 | } 199 | 200 | -------------------------------------------------------------------------------- /src/csvunpipe.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "debug.h" 9 | #include "hints.h" 10 | 11 | 12 | #define NULL_ENCODED '\x1a' 13 | 14 | //#define BUFFER_SIZE 3 15 | 16 | static char _buffer[BUFFER_SIZE]; 17 | 18 | static FILE* _source; 19 | 20 | static void parse_config(int argc, char** argv); 21 | static void do_unpipe(size_t chars_read); 22 | 23 | int main(int argc, char** argv) { 24 | parse_config(argc, argv); 25 | 26 | size_t chars_read; 27 | SEQUENTIAL_HINT(_source); 28 | while ((chars_read = fread(_buffer, sizeof(char), BUFFER_SIZE, _source)) > 0) { 29 | do_unpipe(chars_read); 30 | } 31 | if (_source != stdin) { 32 | fclose(_source); 33 | } 34 | return 0; 35 | } 36 | 37 | static void print_help() { 38 | fprintf(stderr, "usage: csvunpipe [OPTIONS] [FILE]"); 39 | fprintf(stderr, "options:"); 40 | fprintf(stderr, "-p header,row,to,print\n"); 41 | fprintf(stderr, " Header row to print first\n"); 42 | } 43 | 44 | static void parse_config(int argc, char** argv) { 45 | _source = stdin; 46 | char c; 47 | while ((c = getopt (argc, argv, "p:")) != -1) { 48 | switch (c) { 49 | case 'p': 50 | fwrite(optarg, sizeof(char), strlen(optarg), stdout); 51 | 
fwrite("\n", sizeof(char), 1, stdout); 52 | break; 53 | case '?': 54 | case 'h': 55 | default: 56 | print_help(); 57 | exit(1); 58 | break; 59 | } 60 | } 61 | if (optind < argc) { 62 | _source = fopen(argv[optind], "r"); 63 | if (!_source) { 64 | fprintf(stderr, "Could not open file %s for reading\n", argv[optind]); 65 | exit(1); 66 | } 67 | } 68 | } 69 | 70 | static void do_unpipe(size_t chars_read) { 71 | char* restrict current_char = _buffer; 72 | char const* restrict char_end = _buffer + chars_read; 73 | 74 | while (current_char != NULL) { 75 | current_char = memchr(current_char, '\0', char_end - current_char); 76 | if (current_char != NULL) { 77 | *current_char = '\n'; 78 | } 79 | } 80 | current_char = _buffer; 81 | while (current_char != NULL) { 82 | current_char = memchr(current_char, NULL_ENCODED, char_end - current_char); 83 | if (current_char != NULL) { 84 | *current_char = '\0'; 85 | } 86 | } 87 | fwrite(_buffer, sizeof(char), chars_read, stdout); 88 | } 89 | 90 | -------------------------------------------------------------------------------- /src/debug.h: -------------------------------------------------------------------------------- 1 | #define debug_print(fmt, ...) do { fprintf(stderr, fmt, __VA_ARGS__); } while (0) 2 | #ifdef DEBUG 3 | #define LOG_D(fmt, ...) debug_print(" D: "fmt, __VA_ARGS__) 4 | #else 5 | #define LOG_D(fmt, ...) 6 | #endif 7 | 8 | #ifdef MOREDEBUG 9 | #define LOG_V(fmt, ...) debug_print(" V: "fmt, __VA_ARGS__) 10 | #else 11 | #define LOG_V(fmt, ...) 
12 | #endif 13 | -------------------------------------------------------------------------------- /src/hints.h: -------------------------------------------------------------------------------- 1 | #ifndef HINTS_H 2 | #define HINTS_H 3 | #if _XOPEN_SOURCE >= 600 || _POSIX_C_SOURCE >= 200112L 4 | #include 5 | #define SEQUENTIAL_HINT(fd) if (posix_fadvise(fileno(fd), 0, 0, POSIX_FADV_SEQUENTIAL | POSIX_FADV_NOREUSE)) { ; } 6 | #else 7 | #define SEQUENTIAL_HINT(fd) 8 | #endif 9 | 10 | #endif 11 | -------------------------------------------------------------------------------- /test/csv_tokenizer_counts.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "../src/csv_tokenizer.h" 7 | #include "../src/debug.h" 8 | 9 | #define CELL_BUFFER_SIZE (BUFFER_SIZE / 2) + 2 + 1 10 | struct csv_tokenizer* _tokenizer; 11 | static char _buffer[BUFFER_SIZE + BUFFER_TOKENIZER_POSTFIX]; 12 | static Cell _cells[CELL_BUFFER_SIZE]; 13 | 14 | int main(int argc, char** argv) { 15 | (void)argv; 16 | if (argc > 1) { 17 | fprintf(stderr, "This tool is for testing only, pipe a csv into it\n"); 18 | return 0; 19 | } 20 | size_t chars_read; 21 | unsigned long long cell_total = 0; 22 | _tokenizer = setup_tokenizer(',', _buffer, _cells, CELL_BUFFER_SIZE); 23 | while ((chars_read = fread(_buffer, 1, BUFFER_SIZE, stdin)) > 0) { 24 | LOG_D("New data read: %zu\n", chars_read); 25 | prepare_tokenization(_tokenizer, _buffer, chars_read); 26 | size_t buffer_consumed = 0; 27 | size_t cells_found = 0; 28 | bool last_full = true; 29 | 30 | while (buffer_consumed < chars_read) { 31 | tokenize_cells(_tokenizer, buffer_consumed, chars_read, &buffer_consumed, &cells_found, &last_full); 32 | LOG_D("Processed: %zu, Cells: %zu\n", buffer_consumed, cells_found); 33 | cell_total += cells_found; 34 | if (!last_full) { 35 | cell_total--; 36 | } 37 | } 38 | } 39 | fprintf(stdout, "%llu cells\n", cell_total); 40 | 
return 0; 41 | } 42 | -------------------------------------------------------------------------------- /test/csvawk/corners_command: -------------------------------------------------------------------------------- 1 | ARGS=( 'BEGIN {ORS="\x1E";} { print ; }') 2 | -------------------------------------------------------------------------------- /test/csvawk/corners_input.csv: -------------------------------------------------------------------------------- 1 | ../data/corners.csv -------------------------------------------------------------------------------- /test/csvawk/corners_output.csv: -------------------------------------------------------------------------------- 1 | abcd"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" -------------------------------------------------------------------------------- /test/csvawk/large_command: -------------------------------------------------------------------------------- 1 | ARGS=( 'BEGIN {ORS="\x1E";} { print ; }') 2 | -------------------------------------------------------------------------------- /test/csvawk/large_input.csv: -------------------------------------------------------------------------------- 1 | ../data/large-fields.csv -------------------------------------------------------------------------------- /test/csvawk/large_output.csv: -------------------------------------------------------------------------------- 1 | column1column2column3foo 2 bar"foo 2 | 3 | " 3 bar"foo "" 4 | "5bar 5 | -------------------------------------------------------------------------------- /test/csvawk/simple_command: -------------------------------------------------------------------------------- 1 | ARGS=( 'BEGIN {ORS="\x1E";} { print ; }') 2 | -------------------------------------------------------------------------------- /test/csvawk/simple_input.csv: -------------------------------------------------------------------------------- 1 | ../data/simple.csv 
-------------------------------------------------------------------------------- /test/csvawk/simple_output.csv: -------------------------------------------------------------------------------- 1 | abcde12345234563456745678 -------------------------------------------------------------------------------- /test/csvcut/canada_keep_note_command: -------------------------------------------------------------------------------- 1 | ARGS=(-d Note) 2 | -------------------------------------------------------------------------------- /test/csvcut/canada_keep_note_input.csv.xz: -------------------------------------------------------------------------------- 1 | ../data/canada-2011-census.csv.xz -------------------------------------------------------------------------------- /test/csvcut/canada_keep_note_output.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvcut/canada_keep_note_output.csv.xz -------------------------------------------------------------------------------- /test/csvcut/column_quoted1_command: -------------------------------------------------------------------------------- 1 | ARGS=(-e -k "a a" -k "c,") 2 | -------------------------------------------------------------------------------- /test/csvcut/column_quoted1_input.csv: -------------------------------------------------------------------------------- 1 | ../data/quoted_columns.csv -------------------------------------------------------------------------------- /test/csvcut/column_quoted1_output.csv: -------------------------------------------------------------------------------- 1 | "a a","c," 2 | 1,3 3 | 2,4 4 | 3,5 5 | 4,6 6 | -------------------------------------------------------------------------------- /test/csvcut/corners_drop_ab_command: -------------------------------------------------------------------------------- 1 | ARGS=(-d a,b) 2 | 
-------------------------------------------------------------------------------- /test/csvcut/corners_drop_ab_input.csv: -------------------------------------------------------------------------------- 1 | ../data/corners.csv -------------------------------------------------------------------------------- /test/csvcut/corners_drop_ab_output.csv: -------------------------------------------------------------------------------- 1 | c,d 2 | , 3 | , 4 | , 5 | , 6 | ,"""""""""""""" 7 | -------------------------------------------------------------------------------- /test/csvcut/corners_keep_ab_command: -------------------------------------------------------------------------------- 1 | ARGS=(-k a,b) 2 | -------------------------------------------------------------------------------- /test/csvcut/corners_keep_ab_input.csv: -------------------------------------------------------------------------------- 1 | ../data/corners.csv -------------------------------------------------------------------------------- /test/csvcut/corners_keep_ab_output.csv: -------------------------------------------------------------------------------- 1 | a,b 2 | , 3 | , 4 | , 5 | """""""""""""","""""""""""" 6 | """""""""""""","""""""""""" 7 | -------------------------------------------------------------------------------- /test/csvcut/large_keep_12_command: -------------------------------------------------------------------------------- 1 | ARGS=(-K 1,2) 2 | -------------------------------------------------------------------------------- /test/csvcut/large_keep_12_input.csv: -------------------------------------------------------------------------------- 1 | ../data/large-fields.csv -------------------------------------------------------------------------------- /test/csvcut/large_keep_12_output.csv: -------------------------------------------------------------------------------- 1 | large_keep_col23_output.csv -------------------------------------------------------------------------------- 
/test/csvcut/large_keep_col1_command: -------------------------------------------------------------------------------- 1 | ARGS=(-k column1) 2 | -------------------------------------------------------------------------------- /test/csvcut/large_keep_col1_input.csv: -------------------------------------------------------------------------------- 1 | ../data/large-fields.csv -------------------------------------------------------------------------------- /test/csvcut/large_keep_col1_output.csv: -------------------------------------------------------------------------------- 1 | column1 2 | foo 3 | "foo 4 | 5 | " 6 | "foo "" 7 | " 8 | -------------------------------------------------------------------------------- /test/csvcut/large_keep_col23_command: -------------------------------------------------------------------------------- 1 | ARGS=(-k column2,column3) 2 | -------------------------------------------------------------------------------- /test/csvcut/large_keep_col23_input.csv: -------------------------------------------------------------------------------- 1 | ../data/large-fields.csv -------------------------------------------------------------------------------- /test/csvcut/large_keep_col23_output.csv: -------------------------------------------------------------------------------- 1 | column2,column3 2 | 2, bar 3 | 3, bar 4 | 5,bar 5 | -------------------------------------------------------------------------------- /test/csvcut/overlapping_column_names2_command: -------------------------------------------------------------------------------- 1 | ARGS=(-d aaa) 2 | -------------------------------------------------------------------------------- /test/csvcut/overlapping_column_names2_input.csv: -------------------------------------------------------------------------------- 1 | ../data/simple_overlapping_columns.csv -------------------------------------------------------------------------------- /test/csvcut/overlapping_column_names2_output.csv: 
-------------------------------------------------------------------------------- 1 | a,aa,aaaa,b 2 | 1,2,4,5 3 | 2,3,5,6 4 | 3,4,6,7 5 | 4,5,7,8 6 | -------------------------------------------------------------------------------- /test/csvcut/overlapping_column_names_command: -------------------------------------------------------------------------------- 1 | ARGS=(-d a) 2 | -------------------------------------------------------------------------------- /test/csvcut/overlapping_column_names_input.csv: -------------------------------------------------------------------------------- 1 | ../data/simple_overlapping_columns.csv -------------------------------------------------------------------------------- /test/csvcut/overlapping_column_names_output.csv: -------------------------------------------------------------------------------- 1 | aa,aaa,aaaa,b 2 | 2,3,4,5 3 | 3,4,5,6 4 | 4,5,6,7 5 | 5,6,7,8 6 | -------------------------------------------------------------------------------- /test/csvcut/simple_drop_a_command: -------------------------------------------------------------------------------- 1 | ARGS=(-d a) 2 | -------------------------------------------------------------------------------- /test/csvcut/simple_drop_a_input.csv: -------------------------------------------------------------------------------- 1 | ../data/simple.csv -------------------------------------------------------------------------------- /test/csvcut/simple_drop_a_output.csv: -------------------------------------------------------------------------------- 1 | b,c,d,e 2 | 2,3,4,5 3 | 3,4,5,6 4 | 4,5,6,7 5 | 5,6,7,8 6 | -------------------------------------------------------------------------------- /test/csvcut/simple_drop_ab_command: -------------------------------------------------------------------------------- 1 | ARGS=(-d a,b) 2 | -------------------------------------------------------------------------------- /test/csvcut/simple_drop_ab_input.csv: 
-------------------------------------------------------------------------------- 1 | ../data/simple.csv -------------------------------------------------------------------------------- /test/csvcut/simple_drop_ab_output.csv: -------------------------------------------------------------------------------- 1 | c,d,e 2 | 3,4,5 3 | 4,5,6 4 | 5,6,7 5 | 6,7,8 6 | -------------------------------------------------------------------------------- /test/csvcut/simple_keep_ab_command: -------------------------------------------------------------------------------- 1 | ARGS=(-k a,b) 2 | -------------------------------------------------------------------------------- /test/csvcut/simple_keep_ab_input.csv: -------------------------------------------------------------------------------- 1 | ../data/simple.csv -------------------------------------------------------------------------------- /test/csvcut/simple_keep_ab_output.csv: -------------------------------------------------------------------------------- 1 | a,b 2 | 1,2 3 | 2,3 4 | 3,4 5 | 4,5 6 | -------------------------------------------------------------------------------- /test/csvcut/simple_keep_ae_command: -------------------------------------------------------------------------------- 1 | ARGS=(-k a,e) 2 | -------------------------------------------------------------------------------- /test/csvcut/simple_keep_ae_input.csv: -------------------------------------------------------------------------------- 1 | ../data/simple.csv -------------------------------------------------------------------------------- /test/csvcut/simple_keep_ae_output.csv: -------------------------------------------------------------------------------- 1 | a,e 2 | 1,5 3 | 2,6 4 | 3,7 5 | 4,8 6 | -------------------------------------------------------------------------------- /test/csvgrep/char_range_command: -------------------------------------------------------------------------------- 1 | ARGS=(-p Topic/[A-Z][a-e]/) 2 | 
-------------------------------------------------------------------------------- /test/csvgrep/char_range_input.csv.xz: -------------------------------------------------------------------------------- 1 | ../data/canada-2011-census.csv.xz -------------------------------------------------------------------------------- /test/csvgrep/char_range_output.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvgrep/char_range_output.csv.xz -------------------------------------------------------------------------------- /test/csvgrep/empty_cell_command: -------------------------------------------------------------------------------- 1 | ARGS=(-p a/^$/) 2 | -------------------------------------------------------------------------------- /test/csvgrep/empty_cell_input.csv: -------------------------------------------------------------------------------- 1 | ../data/corners.csv -------------------------------------------------------------------------------- /test/csvgrep/empty_cell_output.csv: -------------------------------------------------------------------------------- 1 | a,b,c,d 2 | ,,, 3 | ,,, 4 | ,,, 5 | -------------------------------------------------------------------------------- /test/csvgrep/integer_range_command: -------------------------------------------------------------------------------- 1 | ARGS=(-p Characteristic/201[0-2]/) 2 | -------------------------------------------------------------------------------- /test/csvgrep/integer_range_input.csv.xz: -------------------------------------------------------------------------------- 1 | ../data/canada-2011-census.csv.xz -------------------------------------------------------------------------------- /test/csvgrep/integer_range_output.csv.xz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvgrep/integer_range_output.csv.xz -------------------------------------------------------------------------------- /test/csvgrep/not_option-text_command: -------------------------------------------------------------------------------- 1 | ARGS=(-v -p "column3/(foo|bar)/") 2 | -------------------------------------------------------------------------------- /test/csvgrep/not_option-text_input.csv: -------------------------------------------------------------------------------- 1 | ../data/large-fields.csv -------------------------------------------------------------------------------- /test/csvgrep/not_option-text_output.csv: -------------------------------------------------------------------------------- 1 | column1,column2,column3 2 | -------------------------------------------------------------------------------- /test/csvgrep/not_quoted_cell_command: -------------------------------------------------------------------------------- 1 | ARGS=(-v -p a/[\"]/) 2 | -------------------------------------------------------------------------------- /test/csvgrep/not_quoted_cell_input.csv: -------------------------------------------------------------------------------- 1 | ../data/corners.csv -------------------------------------------------------------------------------- /test/csvgrep/not_quoted_cell_output.csv: -------------------------------------------------------------------------------- 1 | a,b,c,d 2 | ,,, 3 | ,,, 4 | ,,, 5 | -------------------------------------------------------------------------------- /test/csvgrep/one_field_command: -------------------------------------------------------------------------------- 1 | ARGS=(-p a/[0-9]+/) 2 | -------------------------------------------------------------------------------- /test/csvgrep/one_field_input.csv: -------------------------------------------------------------------------------- 1 | ../data/simple.csv 
-------------------------------------------------------------------------------- /test/csvgrep/one_field_output.csv: -------------------------------------------------------------------------------- 1 | a,b,c,d,e 2 | 1,2,3,4,5 3 | 2,3,4,5,6 4 | 3,4,5,6,7 5 | 4,5,6,7,8 6 | -------------------------------------------------------------------------------- /test/csvgrep/option-text_command: -------------------------------------------------------------------------------- 1 | ARGS=(-p 'column1/(foo|bar)/') 2 | -------------------------------------------------------------------------------- /test/csvgrep/option-text_input.csv: -------------------------------------------------------------------------------- 1 | ../data/large-fields.csv -------------------------------------------------------------------------------- /test/csvgrep/option-text_output.csv: -------------------------------------------------------------------------------- 1 | column1,column2,column3 2 | foo, 2, bar 3 | "foo 4 | 5 | ", 3, bar 6 | "foo "" 7 | ",5,bar 8 | -------------------------------------------------------------------------------- /test/csvgrep/option2-text_command: -------------------------------------------------------------------------------- 1 | ARGS=(-p 'column3/(foo|bar)/') 2 | -------------------------------------------------------------------------------- /test/csvgrep/option2-text_input.csv: -------------------------------------------------------------------------------- 1 | ../data/large-fields.csv -------------------------------------------------------------------------------- /test/csvgrep/option2-text_output.csv: -------------------------------------------------------------------------------- 1 | column1,column2,column3 2 | foo, 2, bar 3 | "foo 4 | 5 | ", 3, bar 6 | "foo "" 7 | ",5,bar 8 | -------------------------------------------------------------------------------- /test/csvgrep/overlapping_columns1_command: 
-------------------------------------------------------------------------------- 1 | ARGS=(-p aa/[1-2]/) 2 | -------------------------------------------------------------------------------- /test/csvgrep/overlapping_columns1_input.csv: -------------------------------------------------------------------------------- 1 | ../data/simple_overlapping_columns.csv -------------------------------------------------------------------------------- /test/csvgrep/overlapping_columns1_output.csv: -------------------------------------------------------------------------------- 1 | a,aa,aaa,aaaa,b 2 | 1,2,3,4,5 3 | -------------------------------------------------------------------------------- /test/csvgrep/overlapping_columns2_command: -------------------------------------------------------------------------------- 1 | ARGS=(-p aaa/[3-4]/) 2 | -------------------------------------------------------------------------------- /test/csvgrep/overlapping_columns2_input.csv: -------------------------------------------------------------------------------- 1 | ../data/simple_overlapping_columns.csv -------------------------------------------------------------------------------- /test/csvgrep/overlapping_columns2_output.csv: -------------------------------------------------------------------------------- 1 | a,aa,aaa,aaaa,b 2 | 1,2,3,4,5 3 | 2,3,4,5,6 4 | -------------------------------------------------------------------------------- /test/csvgrep/quoted_cell_command: -------------------------------------------------------------------------------- 1 | ARGS=(-p a/[\"]/) 2 | -------------------------------------------------------------------------------- /test/csvgrep/quoted_cell_input.csv: -------------------------------------------------------------------------------- 1 | ../data/corners.csv -------------------------------------------------------------------------------- /test/csvgrep/quoted_cell_output.csv: -------------------------------------------------------------------------------- 
1 | a,b,c,d 2 | """""""""""""","""""""""""",, 3 | """""""""""""","""""""""""",,"""""""""""""" 4 | -------------------------------------------------------------------------------- /test/csvgrep/two_NOT_field_command: -------------------------------------------------------------------------------- 1 | ARGS=(-v -p a/[1-2]/ -p b/[2-3]/) 2 | -------------------------------------------------------------------------------- /test/csvgrep/two_NOT_field_input.csv: -------------------------------------------------------------------------------- 1 | ../data/simple.csv -------------------------------------------------------------------------------- /test/csvgrep/two_NOT_field_output.csv: -------------------------------------------------------------------------------- 1 | a,b,c,d,e 2 | 3,4,5,6,7 3 | 4,5,6,7,8 4 | -------------------------------------------------------------------------------- /test/csvgrep/two_field_command: -------------------------------------------------------------------------------- 1 | ARGS=(-p a/[1-2]/ -p b/[2-3]/) 2 | -------------------------------------------------------------------------------- /test/csvgrep/two_field_input.csv: -------------------------------------------------------------------------------- 1 | ../data/simple.csv -------------------------------------------------------------------------------- /test/csvgrep/two_field_output.csv: -------------------------------------------------------------------------------- 1 | a,b,c,d,e 2 | 1,2,3,4,5 3 | 2,3,4,5,6 4 | -------------------------------------------------------------------------------- /test/csvpipe/canada_command: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvpipe/canada_command -------------------------------------------------------------------------------- /test/csvpipe/canada_input.csv.xz: 
-------------------------------------------------------------------------------- 1 | ../data/canada-2011-census.csv.xz -------------------------------------------------------------------------------- /test/csvpipe/canada_output.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvpipe/canada_output.csv.xz -------------------------------------------------------------------------------- /test/csvpipe/corners_command: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvpipe/corners_command -------------------------------------------------------------------------------- /test/csvpipe/corners_input.csv: -------------------------------------------------------------------------------- 1 | ../data/corners.csv -------------------------------------------------------------------------------- /test/csvpipe/corners_output.csv: -------------------------------------------------------------------------------- 1 | a,b,c,d,,,,,,,,,"""""""""""""","""""""""""",,"""""""""""""","""""""""""",,"""""""""""""" -------------------------------------------------------------------------------- /test/csvpipe/drop_header_command: -------------------------------------------------------------------------------- 1 | ARGS=(-d) 2 | -------------------------------------------------------------------------------- /test/csvpipe/drop_header_input.csv: -------------------------------------------------------------------------------- 1 | ../data/simple.csv -------------------------------------------------------------------------------- /test/csvpipe/drop_header_output.csv: -------------------------------------------------------------------------------- 1 | 1,2,3,4,52,3,4,5,63,4,5,6,74,5,6,7,8 2 | 
-------------------------------------------------------------------------------- /test/csvpipe/large-fields_command: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvpipe/large-fields_command -------------------------------------------------------------------------------- /test/csvpipe/large-fields_input.csv: -------------------------------------------------------------------------------- 1 | ../data/large-fields.csv -------------------------------------------------------------------------------- /test/csvpipe/large-fields_output.csv: -------------------------------------------------------------------------------- 1 | foo, 2, bar"foo 2 | 3 | ", 3, bar"foo "" 4 | ",5,bar 5 | -------------------------------------------------------------------------------- /test/csvpipe/simple_command: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvpipe/simple_command -------------------------------------------------------------------------------- /test/csvpipe/simple_input.csv: -------------------------------------------------------------------------------- 1 | ../data/simple.csv -------------------------------------------------------------------------------- /test/csvpipe/simple_output.csv: -------------------------------------------------------------------------------- 1 | 1,2,3,4,52,3,4,5,63,4,5,6,74,5,6,7,8 2 | -------------------------------------------------------------------------------- /test/csvtokenizercounts/canada_command: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvtokenizercounts/canada_command 
-------------------------------------------------------------------------------- /test/csvtokenizercounts/canada_input.csv.xz: -------------------------------------------------------------------------------- 1 | ../data/canada-2011-census.csv.xz -------------------------------------------------------------------------------- /test/csvtokenizercounts/canada_output.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvtokenizercounts/canada_output.csv.xz -------------------------------------------------------------------------------- /test/csvtokenizercounts/corners_command: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvtokenizercounts/corners_command -------------------------------------------------------------------------------- /test/csvtokenizercounts/corners_input.csv: -------------------------------------------------------------------------------- 1 | ../data/corners.csv -------------------------------------------------------------------------------- /test/csvtokenizercounts/corners_output.csv: -------------------------------------------------------------------------------- 1 | 30 cells 2 | -------------------------------------------------------------------------------- /test/csvtokenizercounts/large-fields_command: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvtokenizercounts/large-fields_command -------------------------------------------------------------------------------- /test/csvtokenizercounts/large-fields_input.csv: -------------------------------------------------------------------------------- 1 | ../data/large-fields.csv 
-------------------------------------------------------------------------------- /test/csvtokenizercounts/large-fields_output.csv: -------------------------------------------------------------------------------- 1 | 16 cells 2 | -------------------------------------------------------------------------------- /test/csvtokenizercounts/quoted_columns_command: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvtokenizercounts/quoted_columns_command -------------------------------------------------------------------------------- /test/csvtokenizercounts/quoted_columns_input.csv: -------------------------------------------------------------------------------- 1 | ../data/quoted_columns.csv -------------------------------------------------------------------------------- /test/csvtokenizercounts/quoted_columns_output.csv: -------------------------------------------------------------------------------- 1 | 30 cells 2 | -------------------------------------------------------------------------------- /test/csvtokenizercounts/simple_command: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvtokenizercounts/simple_command -------------------------------------------------------------------------------- /test/csvtokenizercounts/simple_input.csv: -------------------------------------------------------------------------------- 1 | ../data/simple.csv -------------------------------------------------------------------------------- /test/csvtokenizercounts/simple_output.csv: -------------------------------------------------------------------------------- 1 | 30 cells 2 | -------------------------------------------------------------------------------- /test/csvtokenizercounts/simple_overlapping_columns_command: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvtokenizercounts/simple_overlapping_columns_command -------------------------------------------------------------------------------- /test/csvtokenizercounts/simple_overlapping_columns_input.csv: -------------------------------------------------------------------------------- 1 | ../data/simple_overlapping_columns.csv -------------------------------------------------------------------------------- /test/csvtokenizercounts/simple_overlapping_columns_output.csv: -------------------------------------------------------------------------------- 1 | 30 cells 2 | -------------------------------------------------------------------------------- /test/csvunpipe/canada_command: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvunpipe/canada_command -------------------------------------------------------------------------------- /test/csvunpipe/canada_input.csv.xz: -------------------------------------------------------------------------------- 1 | ../csvpipe/canada_output.csv.xz -------------------------------------------------------------------------------- /test/csvunpipe/canada_output.csv.xz: -------------------------------------------------------------------------------- 1 | ../csvpipe/canada_input.csv.xz -------------------------------------------------------------------------------- /test/csvunpipe/corners_command: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvunpipe/corners_command -------------------------------------------------------------------------------- /test/csvunpipe/corners_input.csv: 
-------------------------------------------------------------------------------- 1 | ../csvpipe/corners_output.csv -------------------------------------------------------------------------------- /test/csvunpipe/corners_output.csv: -------------------------------------------------------------------------------- 1 | ../csvpipe/corners_input.csv -------------------------------------------------------------------------------- /test/csvunpipe/drop_header_command: -------------------------------------------------------------------------------- 1 | ARGS=(-p a,b,c,d,e) 2 | -------------------------------------------------------------------------------- /test/csvunpipe/drop_header_input.csv: -------------------------------------------------------------------------------- 1 | ../csvpipe/drop_header_output.csv -------------------------------------------------------------------------------- /test/csvunpipe/drop_header_output.csv: -------------------------------------------------------------------------------- 1 | ../csvpipe/drop_header_input.csv -------------------------------------------------------------------------------- /test/csvunpipe/large-fields_command: -------------------------------------------------------------------------------- 1 | ARGS=(-p column1,column2,column3) 2 | -------------------------------------------------------------------------------- /test/csvunpipe/large-fields_input.csv: -------------------------------------------------------------------------------- 1 | ../csvpipe/large-fields_output.csv -------------------------------------------------------------------------------- /test/csvunpipe/large-fields_output.csv: -------------------------------------------------------------------------------- 1 | ../csvpipe/large-fields_input.csv -------------------------------------------------------------------------------- /test/csvunpipe/simple_command: -------------------------------------------------------------------------------- 1 | ARGS=(-p a,b,c,d,e) 2 | 
-------------------------------------------------------------------------------- /test/csvunpipe/simple_input.csv: -------------------------------------------------------------------------------- 1 | ../csvpipe/simple_output.csv -------------------------------------------------------------------------------- /test/csvunpipe/simple_output.csv: -------------------------------------------------------------------------------- 1 | ../csvpipe/simple_input.csv -------------------------------------------------------------------------------- /test/data/canada-2011-census.csv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/data/canada-2011-census.csv.xz -------------------------------------------------------------------------------- /test/data/corners.csv: -------------------------------------------------------------------------------- 1 | a,b,c,d 2 | ,,, 3 | ,,, 4 | ,,, 5 | """""""""""""","""""""""""",, 6 | """""""""""""","""""""""""",,"""""""""""""" 7 | -------------------------------------------------------------------------------- /test/data/large-fields.csv: -------------------------------------------------------------------------------- 1 | column1,column2,column3 2 | foo, 2, bar 3 | "foo 4 | 5 | ", 3, bar 6 | "foo "" 7 | ",5,bar 8 | -------------------------------------------------------------------------------- /test/data/quoted_columns.csv: -------------------------------------------------------------------------------- 1 | "a a","b","c,"," d","e" 2 | 1,2,3,4,5 3 | 2,3,4,5,6 4 | 3,4,5,6,7 5 | 4,5,6,7,8 6 | -------------------------------------------------------------------------------- /test/data/simple.csv: -------------------------------------------------------------------------------- 1 | a,b,c,d,e 2 | 1,2,3,4,5 3 | 2,3,4,5,6 4 | 3,4,5,6,7 5 | 4,5,6,7,8 6 | 
-------------------------------------------------------------------------------- /test/data/simple_overlapping_columns.csv: -------------------------------------------------------------------------------- 1 | a,aa,aaa,aaaa,b 2 | 1,2,3,4,5 3 | 2,3,4,5,6 4 | 3,4,5,6,7 5 | 4,5,6,7,8 6 | -------------------------------------------------------------------------------- /test/runtest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | PROGRAM=$1 3 | LARGE_FILES=$2 4 | RESULT=0 5 | 6 | 7 | test_normal() { 8 | REF_FILE=$OUTPUT 9 | REF=$(cat "$OUTPUT") 10 | OUTPUT=$("../bin/$PROGRAM" "${ARGS[@]}" < "$INPUT") 11 | if (($? > 0)); then 12 | printf "\t- %s params: \"%s\" = \t Failed (%s crashed)\n" "$INPUT" "${ARGS[*]}" "$PROGRAM" 13 | RESULT=1 14 | return 15 | fi 16 | 17 | 18 | if [ "$OUTPUT" != "$REF" ]; then 19 | printf "\t- %s params: \"%s\" = \t Failed\n" "$INPUT" "${ARGS[*]}" 20 | printf "$OUTPUT" > /tmp/error-output.csv 21 | diff -a -d "$REF_FILE" /tmp/error-output.csv 22 | rm /tmp/error-output.csv 23 | printf "" 24 | RESULT=1 25 | else 26 | printf "\t- %s params: \"%s\" = \t OK\n" "$INPUT" "${ARGS[*]}" 27 | fi 28 | } 29 | 30 | test_xz() { 31 | REF=$(xzcat "$OUTPUT" | openssl md5) 32 | OUTPUT=$(xzcat "$INPUT" | "../bin/$PROGRAM" "${ARGS[@]}" | openssl md5) 33 | if (($? > 0)); then 34 | printf "\t- %s params: \"%s\" = \t Failed (%s crashed)\n" "$INPUT" "${ARGS[*]}" "$PROGRAM" 35 | RESULT=1 36 | return 37 | fi 38 | 39 | 40 | if [ "$OUTPUT" != "$REF" ]; then 41 | printf "\t- %s params: \"%s\" = \t Failed\n" "$INPUT" "${ARGS[*]}" 42 | RESULT=1 43 | else 44 | printf "\t- %s params: \"%s\" = \t OK\n" "$INPUT" "${ARGS[*]}" 45 | fi 46 | } 47 | printf "Testing $PROGRAM" 48 | OUTPUT=$("../bin/$PROGRAM" -h 2>&1 | wc -l) 49 | if (($? 
> 0)) || [ $OUTPUT -lt 1 ]; then 50 | printf "\t- %s has no help params" "$PROGRAM" 51 | RESULT=1 52 | fi 53 | 54 | for INPUT in $PROGRAM/*_input.csv*; 55 | do 56 | source "$(printf $INPUT | sed 's/input\.csv.*$/command/')" 57 | #ARGS=$(cat "$(printf $INPUT | sed 's/input\.csv.*$/command/')") 58 | OUTPUT=$(printf $INPUT | sed 's/input/output/') 59 | case $INPUT in 60 | *.csv.xz ) 61 | if (($LARGE_FILES == 1)); then 62 | test_xz 63 | fi 64 | ;; 65 | *.csv ) 66 | test_normal 67 | ;; 68 | esac 69 | done 70 | if [ $RESULT == 0 ]; then 71 | printf "Tests succeeded\n" 72 | else 73 | printf "Tests failed\n" 74 | fi 75 | exit $RESULT 76 | -------------------------------------------------------------------------------- /test/test-sizes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # run from root dir! 3 | 4 | 5 | EXTRA_FLAGS="" 6 | if [ "$#" -ne 0 ]; then 7 | EXTRA_FLAGS="$@" 8 | fi 9 | 10 | set -e 11 | 12 | #silent function from https://serverfault.com/questions/607884 13 | SILENT_LOG=/tmp/silent_log_$$.txt 14 | trap "/bin/rm -f $SILENT_LOG" EXIT 15 | 16 | report_and_exit() { 17 | cat "${SILENT_LOG}" 18 | exit 1 19 | } 20 | 21 | silent() { 22 | `rm -f ${SILENT_LOG}` 23 | $* 2>>"${SILENT_LOG}" >> "${SILENT_LOG}" || report_and_exit; 24 | } 25 | 26 | test_with_size() { 27 | if (($1 > 30)); then 28 | if (($1 > 72)) ; then # csvcut has to read the full header 29 | if (($1 > 145)); then # csvgrep has to fit the max line length in 2*BUFFER_SIZE 30 | make test-csvcut test-csvgrep BUFFER_SIZE=$1 DISABLE_ASSERTS=-g $EXTRA_FLAGS 31 | else 32 | make test-csvgrep BUFFER_SIZE=$1 DISABLE_ASSERTS=-g SKIP_LARGE_FILES=1 $EXTRA_FLAGS 33 | if (($? 
> 0)); then 34 | echo "\033[91mFailure with size $1\033[39m" 35 | return 1 36 | fi 37 | make test-csvcut BUFFER_SIZE=$1 DISABLE_ASSERTS=-g $EXTRA_FLAGS 38 | fi 39 | else 40 | make test-csvcut test-csvgrep BUFFER_SIZE=$1 DISABLE_ASSERTS=-g SKIP_LARGE_FILES=1 $EXTRA_FLAGS 41 | fi 42 | fi 43 | if (($? > 0)); then 44 | echo "\033[91mFailure with size $1\033[39m" 45 | return 1 46 | fi 47 | make test-csvpipe test-csvunpipe test-csvpipe test-csvunpipe test-csvawk test-tokenizer BUFFER_SIZE=$1 DISABLE_ASSERTS=-g $EXTRA_FLAGS 48 | if (($? > 0)); then 49 | echo "\033[91mFailure with size $1\033[39m" 50 | return 1 51 | fi 52 | return 0 53 | } 54 | 55 | echo "Testing predefined sizes" 56 | for s in 1 2 3 4 5 6 7 8 11 16 21 24 32 36 63 128 1024; 57 | do 58 | silent "make deep-clean" 59 | echo "Testing size: \t $s" 60 | silent test_with_size $s 61 | done 62 | 63 | echo "Trying 40 random sizes" 64 | for x in $(seq 1 40); 65 | do 66 | silent "make deep-clean" 67 | RANDOMNUM=$( head -200 /dev/urandom | cksum | cut -f1 -d " ") 68 | s=$(( ( RANDOMNUM % 400 ) + 1 )); 69 | echo "Testing size: \t $s (run $x/40)" 70 | silent test_with_size $s 71 | done 72 | --------------------------------------------------------------------------------