├── .bettercodehub.yml
├── .github
    └── workflows
    │   └── c-cpp.yml
├── .gitignore
├── .gitmodules
├── .travis.yml
├── LICENSE.txt
├── Makefile
├── README.md
├── bench
    ├── csvkit-csvcut.py
    ├── csvkit-csvgrep.py
    ├── generate.c
    ├── generate.h
    ├── runner.c
    └── timer.h
├── src
    ├── .gitignore
    ├── csv_tokenizer.c
    ├── csv_tokenizer.h
    ├── csvawk.c
    ├── csvcut.c
    ├── csvgrep.c
    ├── csvpipe.c
    ├── csvunpipe.c
    ├── debug.h
    └── hints.h
└── test
    ├── csv_tokenizer_counts.c
    ├── csvawk
        ├── corners_command
        ├── corners_input.csv
        ├── corners_output.csv
        ├── large_command
        ├── large_input.csv
        ├── large_output.csv
        ├── simple_command
        ├── simple_input.csv
        └── simple_output.csv
    ├── csvcut
        ├── canada_keep_note_command
        ├── canada_keep_note_input.csv.xz
        ├── canada_keep_note_output.csv.xz
        ├── column_quoted1_command
        ├── column_quoted1_input.csv
        ├── column_quoted1_output.csv
        ├── corners_drop_ab_command
        ├── corners_drop_ab_input.csv
        ├── corners_drop_ab_output.csv
        ├── corners_keep_ab_command
        ├── corners_keep_ab_input.csv
        ├── corners_keep_ab_output.csv
        ├── large_keep_12_command
        ├── large_keep_12_input.csv
        ├── large_keep_12_output.csv
        ├── large_keep_col1_command
        ├── large_keep_col1_input.csv
        ├── large_keep_col1_output.csv
        ├── large_keep_col23_command
        ├── large_keep_col23_input.csv
        ├── large_keep_col23_output.csv
        ├── overlapping_column_names2_command
        ├── overlapping_column_names2_input.csv
        ├── overlapping_column_names2_output.csv
        ├── overlapping_column_names_command
        ├── overlapping_column_names_input.csv
        ├── overlapping_column_names_output.csv
        ├── simple_drop_a_command
        ├── simple_drop_a_input.csv
        ├── simple_drop_a_output.csv
        ├── simple_drop_ab_command
        ├── simple_drop_ab_input.csv
        ├── simple_drop_ab_output.csv
        ├── simple_keep_ab_command
        ├── simple_keep_ab_input.csv
        ├── simple_keep_ab_output.csv
        ├── simple_keep_ae_command
        ├── simple_keep_ae_input.csv
        └── simple_keep_ae_output.csv
    ├── csvgrep
        ├── char_range_command
        ├── char_range_input.csv.xz
        ├── char_range_output.csv.xz
        ├── empty_cell_command
        ├── empty_cell_input.csv
        ├── empty_cell_output.csv
        ├── integer_range_command
        ├── integer_range_input.csv.xz
        ├── integer_range_output.csv.xz
        ├── not_option-text_command
        ├── not_option-text_input.csv
        ├── not_option-text_output.csv
        ├── not_quoted_cell_command
        ├── not_quoted_cell_input.csv
        ├── not_quoted_cell_output.csv
        ├── one_field_command
        ├── one_field_input.csv
        ├── one_field_output.csv
        ├── option-text_command
        ├── option-text_input.csv
        ├── option-text_output.csv
        ├── option2-text_command
        ├── option2-text_input.csv
        ├── option2-text_output.csv
        ├── overlapping_columns1_command
        ├── overlapping_columns1_input.csv
        ├── overlapping_columns1_output.csv
        ├── overlapping_columns2_command
        ├── overlapping_columns2_input.csv
        ├── overlapping_columns2_output.csv
        ├── quoted_cell_command
        ├── quoted_cell_input.csv
        ├── quoted_cell_output.csv
        ├── two_NOT_field_command
        ├── two_NOT_field_input.csv
        ├── two_NOT_field_output.csv
        ├── two_field_command
        ├── two_field_input.csv
        └── two_field_output.csv
    ├── csvpipe
        ├── canada_command
        ├── canada_input.csv.xz
        ├── canada_output.csv.xz
        ├── corners_command
        ├── corners_input.csv
        ├── corners_output.csv
        ├── drop_header_command
        ├── drop_header_input.csv
        ├── drop_header_output.csv
        ├── large-fields_command
        ├── large-fields_input.csv
        ├── large-fields_output.csv
        ├── simple_command
        ├── simple_input.csv
        └── simple_output.csv
    ├── csvtokenizercounts
        ├── canada_command
        ├── canada_input.csv.xz
        ├── canada_output.csv.xz
        ├── corners_command
        ├── corners_input.csv
        ├── corners_output.csv
        ├── large-fields_command
        ├── large-fields_input.csv
        ├── large-fields_output.csv
        ├── quoted_columns_command
        ├── quoted_columns_input.csv
        ├── quoted_columns_output.csv
        ├── simple_command
        ├── simple_input.csv
        ├── simple_output.csv
        ├── simple_overlapping_columns_command
        ├── simple_overlapping_columns_input.csv
        └── simple_overlapping_columns_output.csv
    ├── csvunpipe
        ├── canada_command
        ├── canada_input.csv.xz
        ├── canada_output.csv.xz
        ├── corners_command
        ├── corners_input.csv
        ├── corners_output.csv
        ├── drop_header_command
        ├── drop_header_input.csv
        ├── drop_header_output.csv
        ├── large-fields_command
        ├── large-fields_input.csv
        ├── large-fields_output.csv
        ├── simple_command
        ├── simple_input.csv
        └── simple_output.csv
    ├── data
        ├── canada-2011-census.csv.xz
        ├── corners.csv
        ├── large-fields.csv
        ├── quoted_columns.csv
        ├── simple.csv
        └── simple_overlapping_columns.csv
    ├── runtest.sh
    └── test-sizes.sh


/.bettercodehub.yml:
--------------------------------------------------------------------------------
 1 | component_depth: 1
 2 | languages:
 3 | - name: cpp
 4 |   production:
 5 |     include:
 6 |       - src/*.c
 7 |       - src/*.h
 8 |     exclude:
 9 |       - test/*
10 |       - bench/*
11 |   test:
12 |     include:
13 |       - test/*
14 |       - bench/*
15 | 


--------------------------------------------------------------------------------
/.github/workflows/c-cpp.yml:
--------------------------------------------------------------------------------
 1 | name: ci
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ "master", "main" ]
 6 |   pull_request:
 7 |     branches: [ "master", "main" ]
 8 | 
 9 | jobs:
10 |   build:
11 | 
12 |     runs-on: ubuntu-latest
13 | 
14 |     steps:
15 |     - uses: actions/checkout@v3
16 |     - name: compile
17 |       run: make
18 |     - name: test
19 |       run: make test
20 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | bin-old/
2 | bin/
3 | *.swp
4 | *.gcda
5 | *.gcno
6 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "bench/deps/awk-csv-parser"]
2 | 	path = bench/deps/awk-csv-parser
3 | 	url = https://github.com/geoffroy-aubry/awk-csv-parser.git
4 | [submodule "bench/deps/pcg-c-basic"]
5 | 	path = bench/deps/pcg-c-basic
6 | 	url = https://github.com/imneme/pcg-c-basic.git
7 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: c
 2 | compiler: clang
 3 | 
 4 | addons:
 5 |   apt:
 6 |      packages:
 7 |          - libpcre3-dev
 8 |   sonarcloud:
 9 |       organization: "davylandman-github"
10 | 
11 | script:
12 |    - 'if [ "$TRAVIS_PULL_REQUEST" = "false" ]; then build-wrapper-linux-x86-64 --out-dir output make test test-all-sizes-ci DISABLE_ASSERTS="" COVERAGE=1; fi'
13 |    - 'if [ "$TRAVIS_PULL_REQUEST" = "true" ]; then make test test-all-sizes-ci DISABLE_ASSERTS="" COVERAGE=1; fi'
14 | 
15 | after_success:
16 |    - 'if [ "$TRAVIS_PULL_REQUEST" = "false" ]; then sonar-scanner -Dsonar.sources=. -Dsonar.projectKey="DavyLandman_csvtools" -Dsonar.cfamily.build-wrapper-output=output; fi'
17 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Davy Landman
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
  1 | BUFFER_SIZE=1048576 # 1024K can be overridden with make BUFFER_SIZE=20
  2 | LinkFlags=
  3 | CFLAGS+=-std=gnu99 -Wall -pedantic -Wextra -DBUFFER_SIZE=$(BUFFER_SIZE) -fno-strict-aliasing
  4 | # Optimised build (-O3 plus DISABLE_ASSERTS) unless DEBUG is set; VERBOSE adds -DMOREDEBUG to a debug build.
  5 | DISABLE_ASSERTS=-DNDEBUG
  6 | ifdef DEBUG # set with `make .. DEBUG=1`
  7 | CFLAGS+=-g -DDEBUG
  8 | ifdef VERBOSE
  9 | CFLAGS+=-DMOREDEBUG
 10 | endif
 11 | else
 12 | CFLAGS+=-O3 $(DISABLE_ASSERTS)
 13 | endif
 14 | ifdef PERF
 15 | CFLAGS+=-lprofiler -g
 16 | endif
 17 | # COVERAGE=1 compiles with -coverage and forwards COVERAGE=1 to the test scripts.
 18 | DO_COVERAGE=""
 19 | ifdef COVERAGE
 20 | CFLAGS+=-coverage
 21 | DO_COVERAGE="COVERAGE=1"
 22 | endif
 23 | 
 24 | # TEST_SLOW_PATH defines _SLOW_PATH; otherwise Linux builds are compiled with _GNU_SOURCE.
 25 | ifndef TEST_SLOW_PATH
 26 | 	UNAME_S := $(shell uname -s)
 27 | 	ifeq ($(UNAME_S),Linux)
 28 | 		CFLAGS += -D_GNU_SOURCE
 29 | 	endif
 30 | else
 31 | 	CFLAGS += -D_SLOW_PATH
 32 | endif
 33 | # Source files that make up each binary.
 34 | CSV_GREP_FILES = src/csvgrep.c src/csv_tokenizer.c
 35 | CSV_CUT_FILES = src/csvcut.c src/csv_tokenizer.c
 36 | CSV_TOK_TEST_COUNT_FILES = test/csv_tokenizer_counts.c src/csv_tokenizer.c
 37 | CSV_PIPE_FILES = src/csvpipe.c
 38 | CSV_UNPIPE_FILES = src/csvunpipe.c
 39 | CSV_AWK_FILES = src/csvawk.c
 40 | BENCH_FILES = bench/runner.c bench/generate.c bench/deps/pcg-c-basic/pcg_basic.c
 41 | 
 42 | .PHONY: all bench csvcut csvgrep csvpipe csvunpipe csvawk test clean deep-clean test-csvgrep test-csvcut test-csvpipe test-csvunpipe test-csvawk test-tokenizer test-all-sizes test-all-sizes-ci install
 43 | # Default target: build every command-line tool (each binary listed once).
 44 | all: bin/csvcut bin/csvgrep bin/csvpipe bin/csvunpipe bin/csvawk
 45 | 
 46 | # yes, we recompile csv_tokenizer, it keeps the makefile simpler and it allows
 47 | # the compiler to do some cross module optimizations :)
 48 | 
 49 | bench: bin/bench
 50 | bin/bench: $(BENCH_FILES) all | bin/
 51 | 	$(CC) -o $@ $(LinkFlags) $(CFLAGS) $(BENCH_FILES) 
 52 | # Fetch the benchmark's git submodules when the pcg sources are missing.
 53 | bench/deps/pcg-c-basic/pcg_basic.c:
 54 | 	(cd bench/deps/pcg-c-basic/ && git submodule init && git submodule update)
 55 | 	(cd bench/deps/awk-csv-parser/ && git submodule init && git submodule update)
 56 | 
 57 | csvcut: bin/csvcut
 58 | bin/csvcut: $(CSV_CUT_FILES) Makefile | bin/
 59 | 	$(CC) -o $@ $(LinkFlags) $(CFLAGS) $(CSV_CUT_FILES) 
 60 | 
 61 | csvpipe: bin/csvpipe
 62 | bin/csvpipe: $(CSV_PIPE_FILES) Makefile | bin/
 63 | 	$(CC) -o $@ $(LinkFlags) $(CFLAGS) $(CSV_PIPE_FILES) 
 64 | 
 65 | csvunpipe: bin/csvunpipe
 66 | bin/csvunpipe: $(CSV_UNPIPE_FILES) Makefile | bin/
 67 | 	$(CC) -o $@ $(LinkFlags) $(CFLAGS) $(CSV_UNPIPE_FILES) 
 68 | 
 69 | csvawk: bin/csvawk
 70 | bin/csvawk: $(CSV_AWK_FILES) Makefile | bin/
 71 | 	$(CC) -o $@ $(LinkFlags) $(CFLAGS) $(CSV_AWK_FILES) 
 72 | 
 73 | csvgrep: bin/csvgrep
 74 | bin/csvgrep: $(CSV_GREP_FILES) Makefile | bin/
 75 | 	$(CC) -o $@ $(LinkFlags) $(CFLAGS) `pcre-config --cflags` $(CSV_GREP_FILES) `pcre-config --libs`
 76 | 
 77 | bin/csvtokenizercounts: $(CSV_TOK_TEST_COUNT_FILES) Makefile | bin/
 78 | 	$(CC) -o $@ $(LinkFlags) $(CFLAGS) $(CSV_TOK_TEST_COUNT_FILES)
 79 | # bin/ is order-only (after the |): it must exist, but its changing mtime must not trigger relinks.
 80 | bin/:
 81 | 	mkdir -p bin/
 82 | 
 83 | ifdef SKIP_LARGE_FILES
 84 | LARGE_FILES=0
 85 | else
 86 | LARGE_FILES=1
 87 | endif
 88 | # Each test-<tool> target runs test/runtest.sh for that tool from inside test/.
 89 | test: test-csvgrep test-csvcut test-csvpipe test-csvunpipe test-csvawk test-tokenizer
 90 | 
 91 | test-csvgrep: bin/csvgrep
 92 | 	cd test && ./runtest.sh csvgrep $(LARGE_FILES) $(DO_COVERAGE)
 93 | 
 94 | test-csvcut: bin/csvcut
 95 | 	cd test && ./runtest.sh csvcut $(LARGE_FILES) $(DO_COVERAGE)
 96 | 	
 97 | test-csvpipe: bin/csvpipe
 98 | 	cd test && ./runtest.sh csvpipe $(LARGE_FILES) $(DO_COVERAGE)
 99 | 
100 | test-csvunpipe: bin/csvunpipe
101 | 	cd test && ./runtest.sh csvunpipe $(LARGE_FILES) $(DO_COVERAGE)
102 | 
103 | test-csvawk: bin/csvawk
104 | 	cd test && ./runtest.sh csvawk $(LARGE_FILES) $(DO_COVERAGE)
105 | 
106 | test-tokenizer: bin/csvtokenizercounts
107 | 	cd test && ./runtest.sh csvtokenizercounts $(LARGE_FILES) $(DO_COVERAGE)
108 | # Runs test/test-sizes.sh, forwarding the coverage setting.
109 | test-all-sizes: 
110 | 	 ./test/test-sizes.sh $(DO_COVERAGE)
111 | # CI variant: downloads the codecov bash script and runs it before and after each test-sizes.sh run.
112 | test-all-sizes-ci: 
113 | 	 curl -s https://codecov.io/bash > /tmp/codecov.sh
114 | 	 bash /tmp/codecov.sh -x "llvm-cov gcov"
115 | 	 ./test/test-sizes.sh $(DO_COVERAGE)
116 | 	 bash /tmp/codecov.sh -x "llvm-cov gcov"
117 | 	 ./test/test-sizes.sh $(DO_COVERAGE) TEST_SLOW_PATH=1
118 | 	 bash /tmp/codecov.sh -x "llvm-cov gcov"
119 | 
120 | 
121 | 
122 | prefix=/usr/local
123 | # Installs the tools into $(prefix)/bin; override with `make install prefix=...`.
124 | install: all
125 | 	install -m 0755 bin/csvcut $(prefix)/bin/csvcut
126 | 	install -m 0755 bin/csvgrep $(prefix)/bin/csvgrep
127 | 	install -m 0755 bin/csvawk $(prefix)/bin/csvawk
128 | 	install -m 0755 bin/csvpipe $(prefix)/bin/csvpipe
129 | 	install -m 0755 bin/csvunpipe $(prefix)/bin/csvunpipe
130 | # clean removes the built binaries; deep-clean also removes coverage files (no bash-only brace expansion).
131 | clean:
132 | 	rm -rf bin/*
133 | deep-clean:
134 | 	rm -rf bin/*
135 | 	rm -rf *.gcov *.gcda *.gcno
136 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # csvtools, fast processing of CSV streams
  2 | [![Build Status](https://travis-ci.org/DavyLandman/csvtools.svg?branch=master)](https://travis-ci.org/DavyLandman/csvtools)
  3 | [![Coverity Scan Build Status](https://img.shields.io/coverity/scan/5024.svg)](https://scan.coverity.com/projects/5024)
  4 | [![codecov.io](https://codecov.io/github/DavyLandman/csvtools/coverage.svg?branch=master)](https://codecov.io/github/DavyLandman/csvtools?branch=master)
  5 | 
  6 | 
  7 | As our data gets bigger, CSV files grow in size.
  8 | The CSV format is not exactly pipe-friendly due to embedded newlines and quoted separators.
  9 | [onyxfish/csvkit](https://github.com/onyxfish/csvkit) offers a great set of utilities for most tasks you would want to perform on CSV's in a gnu toolset kind of way.
 10 | However, it is not fast. For reasonable data sets, this doesn't matter, but for CSVs of more than a few MBs, you start to feel the pain.
 11 | 
 12 | This repository contains gnu-alike tools for parsing [RFC 4180](https://tools.ietf.org/html/rfc4180) CSVs at high speed.
 13 | 
 14 | ## Tools
 15 | 
 16 | - `csvcut` a `cut(1)` equivalent to drop columns from a csv file
 17 | - `csvgrep` a `grep(1)` equivalent to match on one or more columns per row, and only keep the rows matching all or any of the patterns. (it uses PCRE for regular expression goodness)
 18 | - `csvawk` a wrapper for `awk(1)` which correctly recognizes rows and cells (even across newlines). This is comparable to [geoffroy-aubry/awk-csv-parser](https://github.com/geoffroy-aubry/awk-csv-parser), except that it also supports embedded newlines.
 19 | - `csvpipe` and `csvunpipe` translate the newlines separating rows to `\0` such that `sort -z` and `uniq -z` and other null-terminated-line based tools can be used more correctly.
 20 | 
 21 | ## Performance
 22 | 
 23 | Benchmarking is complicated, the primary goal is to measure only that of interest, by reducing the impact of other factors. Originally csvtools was benchmarked on the [Canada 2011 census](http://www12.statcan.gc.ca/census-recensement/2011/dp-pd/prof/details/download-telecharger/comprehensive/comp-csv-tab-dwnld-tlchrgr.cfm?Lang=E), however, we were primarily measuring the limits of the SSD and the caches around the file system. 
 24 | 
 25 | Now we benchmark with a custom tool: [`bench/runner.c`](bench/runner.c). This benchmark first generates an in memory random csv data set (see [`bench/generate.c`](bench/generate.c)), and then pipes this into the applications under test. This at least takes the IO and FS out of the equation.
 26 | 
 27 | We compare `csvtools` with other solutions. Note that these solutions might not correctly handle CSV's. The reported numbers are _median_ MiB/s.
 28 | 
 29 | ### Pure pipe speed
 30 | 
 31 | | command | median speed |
 32 | | :-- | --: |
 33 | | `cat > /dev/null` | 2042.1 MiB/s |
 34 | | `wc -l > /dev/null` | 2149.0 MiB/s |
 35 | | `md5sum > /dev/null` | 566.8 MiB/s |
 36 | 
 37 | 
 38 | ### csvcut
 39 | 
 40 | | scenario | csvkit | cut | sed | csvtools |
 41 | | :--- | ---: | ---: | ---: | ---: |
 42 | | first column | 8.0 MiB/s | 278.8 MiB/s | 356.9 MiB/s | _644.1 MiB/s_ |
 43 | | middle column  | 8.1 MiB/s | 280.3 MiB/s |  138.6 MiB/s | _555.8 MiB/s_ |
 44 | | last column | 8.0 MiB/s | 280.0 MiB/s | 90.1 MiB/s | _565.0 MiB/s_ |
 45 | | two adjoining columns | 7.3 MiB/s | 359 MiB/s | 59.6 MiB/s | _561.6 MiB/s_ |
 46 | | two distinct columns | 7.3 MiB/s | 449 MiB/s | 59.8 MiB/s | _480.9 MiB/s_ |
 47 | 
 48 | So even compared to sed or cut, which aren't handling quoted separators correctly, our `csvcut` is much faster. 
 49 | 
 50 | ### csvgrep
 51 | 
 52 | | scenario | csvkit | grep | awk | csvtools |
 53 | | :--- | ---: | ---: | ---: | ---: |
 54 | | first column | 7.6 MiB/s | 347.9 MiB/s | 469.2 MiB/s | _588.0 MiB/s_ |
 55 | | middle column | 7.8 MiB/s | 302.8 MiB/s | 379.3 MiB/s | _579.0 MiB/s_ |
 56 | | last column | 7.7 MiB/s | 392.7 MiB/s | 341.5 MiB/s | _632.5 MiB/s_ |
 57 | | two distinct columns | 9.0 MiB/s | 273.9 MiB/s | 380.0 MiB/s | _569.7 MiB/s_ |
 58 | 
 59 | Faster than grep and awk, this is because the column selection in grep is done with negative character classes multiple times.
 60 | 
 61 | There are, of course, regular expressions for which PCRE is slower than grep.
 62 | 
 63 | ### csvawk
 64 | 
 65 | | scenario | awk | awk-csv-parser | csvtools |
 66 | | :--- | ---: | ---: | ---: |
 67 | | print second column | 428.5 MiB/s | 2.45 MiB/s | _278.5 MiB/s_ |
 68 | | sum last column | 350.5 MiB/s | 2.4 MiB/s | _225.9 MiB/s_ |
 69 | 
 70 | Sadly, `csvawk` is slower than pure `awk`. This is caused by the custom record separator (instead of the normal newline). Benchmarking `csvawk` piping to `awk` shows it performs around 800 MiB/s, and if newlines are used as separators, the whole `csvawk` performs similarly to `awk`'s raw performance.
 71 | 
 72 | However, newlines are not valid separators, since they can occur inside quoted fields. For `csvawk` we generate [`\x1E`](https://en.wikipedia.org/wiki/C0_and_C1_control_codes#Field_separators) between records (as per ISO 646), and [`\x1F`](https://en.wikipedia.org/wiki/C0_and_C1_control_codes#Field_separators) between fields in a record. 
 73 | 
 74 | The results of the second benchmark differ, since awk doesn't correctly handle nested separators.
 75 | 
 76 | ### Why so fast?
 77 | No malloc & memcpy!
 78 | 
 79 | Or as valgrind reports it:
 80 | ```
 81 | ==2473==   total heap usage: 18 allocs, 18 frees, 210 bytes allocated
 82 | ```
 83 | 
 84 | In the critical path of tokenizing the csv stream and writing it to `stdout`, there are no copies or memory allocations. The programs read into a buffer from `stdin` (or the file passed as last argument), the tokenizer stores offsets (to that buffer) and lengths in a cell array, and the printer writes from the same buffer, using the offsets and lengths from the cell array.
 85 | 
 86 | ## Installation
 87 | 
 88 | 1. Clone this repository
 89 | 2. Navigate to it
 90 | 3. `make install` (or with prefix: `make install prefix=~/.apps/`)
 91 | 4. enjoy :)
 92 | 
 93 | ## Future work
 94 | 
 95 | - Decide on issue #4
 96 | - Think of better names that don't clash with csvkit?
 97 | - More tests
 98 | - add option to remove the header
 99 | - sort on columns?
100 | 
101 | 


--------------------------------------------------------------------------------
/bench/csvkit-csvcut.py:
--------------------------------------------------------------------------------
 1 | ## EASY-INSTALL-ENTRY-SCRIPT: 'csvkit==0.9.0','console_scripts','csvjson'
 2 | __requires__ = 'csvkit==0.9.0'
 3 | import sys
 4 | from pkg_resources import load_entry_point
 5 | # NOTE(review): the header line names 'csvjson', but the entry point loaded below is 'csvcut'.
 6 | if __name__ == '__main__':
 7 |     sys.exit(
 8 |         load_entry_point('csvkit==0.9.0', 'console_scripts', 'csvcut')()
 9 |     )
10 | 


--------------------------------------------------------------------------------
/bench/csvkit-csvgrep.py:
--------------------------------------------------------------------------------
 1 | ## EASY-INSTALL-ENTRY-SCRIPT: 'csvkit==0.9.0','console_scripts','csvjson'
 2 | __requires__ = 'csvkit==0.9.0'
 3 | import sys
 4 | from pkg_resources import load_entry_point
 5 | # NOTE(review): the header line names 'csvjson', but the entry point loaded below is 'csvgrep'.
 6 | if __name__ == '__main__':
 7 |     sys.exit(
 8 |         load_entry_point('csvkit==0.9.0', 'console_scripts', 'csvgrep')()
 9 |     )
10 | 


--------------------------------------------------------------------------------
/bench/generate.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <limits.h>
  3 | #include <string.h>
  4 | #include <math.h>
  5 | #include <stdbool.h>
  6 | #include "deps/pcg-c-basic/pcg_basic.h"
  7 | #include "generate.h"
  8 | 
  9 | 
 10 | #define MAX(a,b) (((a) > (b)) ? (a) : (b))
 11 | /* Returns true when one draw from rng falls below UINT32_MAX / one_in, i.e. roughly once every one_in calls. */
 12 | inline static bool one_every(pcg32_random_t* rng, int one_in) {
 13 |     return (UINT32_MAX / one_in) > pcg32_random_r(rng);
 14 | }
 15 | /* Scales one draw from rng by 2^-32 (presumably a 32-bit value, which gives a double in [0, 1)). */
 16 | static double random_float(pcg32_random_t* rng) {
 17 |     return ldexp((double)pcg32_random_r(rng), -32);
 18 | }
 19 | /* Random value in the inclusive range [a, b]; the +1 makes b itself ('Z', 'z', '9') reachable. */
 20 | #define RANDOM_RANGE(rng, a,b) ((a) + pcg32_boundedrand_r((rng), (b) - (a) + 1))
 21 | /* Picks a random letter via RANDOM_RANGE: upper case when one_every(rng, 2) fires, lower case otherwise. */
 22 | static char random_alpha(pcg32_random_t* rng) {
 23 |     if (!one_every(rng, 2)) {
 24 |         return RANDOM_RANGE(rng, 'a', 'z');
 25 |     }
 26 |     return RANDOM_RANGE(rng, 'A', 'Z');
 27 | }
 28 | /* Picks a random decimal digit character via RANDOM_RANGE(rng, '0', '9'). */
 29 | static char random_numeric(pcg32_random_t* rng) {
 30 |     return RANDOM_RANGE(rng, '0', '9');
 31 | }
 32 | /* Returns a digit when one_every(rng, 2) fires and a letter otherwise. */
 33 | static char random_alpha_numeric(pcg32_random_t* rng) {
 34 |     if (!one_every(rng, 2)) {
 35 |         return random_alpha(rng);
 36 |     }
 37 |     return random_numeric(rng);
 38 | }
 39 | /* Writes one CSV row of `columns` random cells, terminated by '\n', to target.
 40 |  * Returns the number of bytes written; no bounds check is done, the caller must provide enough room. */
 41 | static size_t random_cell(pcg32_random_t* rng, char* restrict target, const unsigned int columns, const size_t cell_size_max) {
 42 |     size_t written = 0;
 43 |     for (unsigned int i = 0; i < columns; i++) {
 44 |         if (i > 0) {
 45 |             *target++ =',';
 46 |             written++;
 47 |         }
 48 |         size_t cell_size = pcg32_boundedrand_r(rng, random_float(rng) < 0.2 ? cell_size_max : MAX(1, cell_size_max / 40)); /* bound is cell_size_max when random_float < 0.2, else cell_size_max / 40 (at least 1) */
 49 |         if (cell_size < 2) { /* at least 2, so the quoted branch can subtract its two quotes */
 50 |             cell_size = 2;
 51 |         }
 52 |         if (one_every(rng, 3)) { /* digits-only cell */
 53 |             for (size_t c = 0; c < cell_size; c++) {
 54 |                 *target++ = random_numeric(rng);
 55 |             }
 56 |         }
 57 |         else if (!one_every(rng, 10)) { /* unquoted alphanumeric cell */
 58 |             for (size_t c = 0; c < cell_size; c++) {
 59 |                 *target++ = random_alpha_numeric(rng);
 60 |             }
 61 |         }
 62 |         else { /* quoted cell that may contain spaces, commas, escaped quotes and newlines */
 63 |             *target++ = '"';
 64 |             written++;
 65 |             cell_size -= 2; /* the two quotes are counted separately through written++ */
 66 |             for (size_t c = 0; c < cell_size; c++) {
 67 |                 *target++ = random_alpha(rng);
 68 |                 if (c + 2 < cell_size) { /* extra characters are only inserted while this much room is left */
 69 |                     if (one_every(rng, 4)) {
 70 |                         *target++ = ' ';
 71 |                         cell_size--; /* the extra byte is counted in written, so remove it from cell_size */
 72 |                         written++;
 73 |                     }
 74 |                     else if (one_every(rng, 6)) {
 75 |                         *target++ = ',';
 76 |                         cell_size--;
 77 |                         written++;
 78 |                     }
 79 |                     else if (one_every(rng, 100)) {
 80 |                         *target++ ='"'; /* a quote inside a quoted cell is written doubled */
 81 |                         *target++ ='"';
 82 |                         written += 2;
 83 |                         cell_size -= 2;
 84 |                     }
 85 |                     else if (one_every(rng, 1000)) {
 86 |                         *target++ ='\n';
 87 |                         cell_size--;
 88 |                         written++;
 89 |                     }
 90 |                 }
 91 |             }
 92 |             *target++ = '"';
 93 |             written++;
 94 |         }
 95 |         written += cell_size; /* bytes written by the per-character loops above */
 96 |     }
 97 |     *target++ = '\n';
 98 |     written++;
 99 |     return written;
100 | }
101 | /* Fills buffer with a "column1,...,columnN" header and random rows; returns the bytes written. *ten_percent gets the offset of the last row end inside the first 10% of size (at least the end of the header). */
102 | size_t generate_csv(char* restrict buffer, size_t size, size_t* ten_percent, unsigned int seed1, unsigned int seed2, unsigned int columns) {
103 |     const size_t original_size = size;
104 |     char* restrict current_char = buffer;
105 |     for (unsigned int i = 1; i <= columns; i++) {
106 |         if (i > 1) {
107 |             *current_char++ = ',';
108 |             size--;
109 |         }
110 |         memcpy(current_char, "column", 6);
111 |         current_char += 6;
112 |         size -= 6;
113 |         int len = snprintf(current_char, (CHAR_BIT * sizeof(int) - 1) / 3 + 2, "%u", i); /* i is unsigned, so %u */
114 |         current_char += len;
115 |         size -= len;
116 |     }
117 |     *current_char++ = '\n';
118 |     size--;
119 |     *ten_percent = original_size - size; /* always initialise the out-parameter, even if no row ends inside the first 10% */
120 |     pcg32_random_t rng;
121 | 
122 |     pcg32_srandom_r(&rng, seed1, seed2);
123 | 
124 |     unsigned int cell_large_max = 255;
125 | 
126 |     while (size > ((cell_large_max + 1) * columns + 1)) { /* stop while there is still room for one more full-size row */
127 |         size_t written = random_cell(&rng, current_char, columns, cell_large_max);
128 |         current_char += written;
129 |         size -= written;
130 |         if (original_size - size < (original_size / 10)) {
131 |             *ten_percent = original_size - size;
132 |         }
133 |     }
134 |     size_t written = random_cell(&rng, current_char, columns, 2); /* closing row, generated with cell_size_max = 2 */
135 |     current_char += written;
136 |     return current_char - buffer;
137 | }
138 | 


--------------------------------------------------------------------------------
/bench/generate.h:
--------------------------------------------------------------------------------
1 | #ifndef _GENERATE_H /* declares generate_csv(), implemented in bench/generate.c */
2 | #define _GENERATE_H
3 | #include <stddef.h> /* size_t, so the header does not depend on the includer's include order */
4 | size_t generate_csv(char* restrict buffer, size_t size, size_t* ten_percent, unsigned int seed1, unsigned int seed2, unsigned int columns);
5 | #endif


--------------------------------------------------------------------------------
/bench/runner.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <stdbool.h>
  4 | #include <unistd.h>
  5 | #include <string.h>
  6 | #include "timer.h"
  7 | #include "generate.h"
  8 | /* Prints the command-line usage summary of the bench tool to stderr. */
  9 | static void print_help() {
 10 |     static const char usage[] =
 11 |         "usage: bench [OPTIONS]\n"
 12 |         "options:\n"
 13 |         "-b 200\n"
 14 |         "\tbench size in MBs\n"
 15 |         "-e 2\n"
 16 |         "\tenlarge bench size by repeating it x times\n"
 17 |         "-c 6\n"
 18 |         "\tcolumns to generate\n"
 19 |         "-r 5\n"
 20 |         "\tnumber of measure runs\n"
 21 |         "-s 42\n"
 22 |         "\tseed for random generator\n"
 23 |         "-x\n\tonly run the csvtools (used for comparing new commits)\n"
 24 |         "-p\n\toutput the generated data to stdout\n";
 25 |     fputs(usage, stderr);
 26 | }
 27 | 
 28 | static void run(const char* restrict command, const char* restrict buffer, size_t buffer_size, unsigned int buffer_copy, unsigned int repeats, double* results) {
 29 |     for (unsigned int r = 0; r < repeats; r++) {
 30 |         FILE* target = popen(command, "w");
 31 |         if (!target) {
 32 |             fprintf(stderr, "Can't start \"%s\"\n", command);
 33 |             results[r] = -1;
 34 |         }
 35 |         double start = getRealTime();
 36 |         for (unsigned int b = 0; b < buffer_copy; b++) {
 37 |             fwrite(buffer, sizeof(char), buffer_size, target);
 38 |             fflush(target);
 39 |         }
 40 |         if (pclose(target) != 0) {
 41 |             fprintf(stderr, "\"%s\" had an error.\n", command);
 42 |             results[r] = -1;
 43 |         }
 44 |         double stop = getRealTime();
 45 |         results[r] = (stop - start);
 46 |     }
 47 | }
 48 | 
 49 | /* Returns a malloc'ed copy of s with every ch replaced by repl (caller frees), or NULL if allocation fails. Based on nneonneo's answer at http://stackoverflow.com/questions/12890008/replacing-character-in-a-string */
 50 | char *replace(const char *s, char ch, const char *repl) {
 51 |     size_t count = 0; /* occurrences of ch; size_t so very long inputs cannot overflow an int */
 52 |     for(const char* t=s; *t; t++)
 53 |         count += (*t == ch);
 54 |     const size_t rlen = strlen(repl);
 55 |     char *res = malloc(strlen(s) + (rlen-1)*count + 1); /* each occurrence changes the length by rlen - 1; +1 for the terminator */
 56 |     if (!res) return NULL;
 57 |     char *ptr = res;
 58 |     for(const char* t=s; *t; t++) {
 59 |         if(*t == ch) {
 60 |             memcpy(ptr, repl, rlen);
 61 |             ptr += rlen;
 62 |         } else {
 63 |             *ptr++ = *t;
 64 |         }
 65 |     }
 66 |     *ptr = 0;
 67 |     return res;
 68 | }
 69 | /* qsort comparator that orders doubles from largest to smallest and returns 0 for equal values, so the ordering is consistent. */
 70 | int compare_double(const void *d1, const void *d2) { 
 71 |     return (*(const double*)d1 < *(const double*)d2) - (*(const double*)d1 > *(const double*)d2);
 72 | } 
 73 | /* Median of an already sorted array: the middle element, or the mean of the two middle elements for an even count. */
 74 | static double median(double* data, size_t elements) {
 75 |     const size_t mid = elements / 2;
 76 |     const bool odd = (elements % 2 == 1);
 77 |     /* for an even count, mid is the upper of the two middle elements */
 78 |     return odd ? data[mid] : (data[mid - 1] + data[mid]) / 2;
 79 | }
 80 | /* Converts a duration n for buffer_size * buffer_copy bytes into a throughput in units of 1024*1024 bytes per unit of n. */
 81 | static double to_MBps(double n, size_t buffer_size, unsigned int buffer_copy) {
 82 |     return ((double)(buffer_size * buffer_copy) / n) / 1048576.0;
 83 | }
 84 | /* Benchmarks `command` and prints one CSV line to stdout: program, name, command, then the throughput of the first, last and median sorted timing. */
 85 | static void print_run(const char* program, const char* name, const char* restrict command, const char* restrict buffer, size_t buffer_size, unsigned int buffer_copy, unsigned int repeats) {
 86 |     double* timings = calloc(repeats, sizeof(double));
 87 |     run(command, buffer, buffer_size, buffer_copy, repeats, timings);
 88 |     qsort(timings, repeats, sizeof(double), compare_double);
 89 |     char* quoted_command = replace(command, '"', "\"\""); /* double the quotes so the CSV cell stays valid */
 90 |     char* quoted_name = replace(name, '"', "\"\"");
 91 |     const double first = to_MBps(timings[0], buffer_size, buffer_copy);
 92 |     const double last = to_MBps(timings[repeats - 1], buffer_size, buffer_copy);
 93 |     const double middle = to_MBps(median(timings, repeats), buffer_size, buffer_copy);
 94 |     fprintf(stdout, "%s,\"%s\",\"%s\",%f,%f,%f\n", program, quoted_name, quoted_command, first, last, middle);
 95 |     free(quoted_name);
 96 |     free(quoted_command);
 97 |     free(timings);
 98 | }
 99 | 
100 | /* Benchmarks csvkit's csvgrep on the first, middle and last column and on two columns; always sends the buffer once, so buffer_copy is ignored. */
101 | static void csvgrep_csvkit(const char* restrict buffer, size_t buffer_size, unsigned int buffer_copy, unsigned int repeats, unsigned int columns) {
102 |     (void)buffer_copy; // not used
103 |     const char* const tool = "csvkit csvgrep";
104 |     const unsigned int middle = columns / 2;
105 |     char command[255];
106 | 
107 |     fprintf(stderr, "Running csvkit csvgrep\n");
108 |     print_run(tool, "first column", "python bench/csvkit-csvgrep.py -c column1 -r '.*[a-e]+.*' > /dev/null", buffer, buffer_size, 1, repeats);
109 |     sprintf(command, "python bench/csvkit-csvgrep.py -c column%u -r '.*[a-e]+.*' > /dev/null", middle);
110 |     print_run(tool, "middle column", command, buffer, buffer_size, 1, repeats);
111 |     sprintf(command, "python bench/csvkit-csvgrep.py -c column%u -r '.*[a-e]+.*' > /dev/null", columns);
112 |     print_run(tool, "last column", command , buffer, buffer_size, 1, repeats);
113 |     sprintf(command, "python bench/csvkit-csvgrep.py -c column%u,column%u -r '.*[a-e]+.*' > /dev/null", middle, columns - 1);
114 |     print_run(tool, "two columns", command , buffer, buffer_size, 1, repeats);
115 | }
116 | 
/**
 * Benchmarks plain awk used as a column grep (comma as field separator) on the
 * first, middle and last column and on a two column condition.
 *
 * NOTE(review): the first two runs send the buffer once while the last two
 * use buffer_copy; confirm whether that difference is intended.
 */
static void csvgrep_awk(const char* restrict buffer, size_t buffer_size, unsigned int buffer_copy, unsigned int repeats, unsigned int columns) {
    static const char tool[] = "awk grep";
    const unsigned int middle = columns / 2;
    const unsigned int before_last = columns - 1;
    char cmd[255];

    fprintf(stderr, "Running awk grep\n");

    // first column: fixed command line
    print_run(tool, "first column", "LC_ALL='C' awk -F\",\" '$1 ~ /.*[a-e]+.*/ { print }' > /dev/null", buffer, buffer_size, 1, repeats);

    // middle column
    sprintf(cmd, "LC_ALL='C' awk -F\",\" '$%u ~ /.*[a-e]+.*/ { print }' > /dev/null", middle);
    print_run(tool, "middle column", cmd, buffer, buffer_size, 1, repeats);

    // last column
    sprintf(cmd, "LC_ALL='C' awk -F\",\" '$%u ~ /.*[a-e]+.*/ { print }' > /dev/null", columns);
    print_run(tool, "last column", cmd, buffer, buffer_size, buffer_copy, repeats);

    // both the middle column and the one before the last have to match
    sprintf(cmd, "LC_ALL='C' awk -F\",\" '$%u ~ /.*[a-e]+.*/ && $%u ~ /.*[F-L]+.*/ { print }' > /dev/null", middle, before_last);
    print_run(tool, "two columns", cmd, buffer, buffer_size, buffer_copy, repeats);
}
131 | 
/**
 * Benchmarks bin/csvgrep on the first, middle and last column and with two
 * patterns at once, printing one row per run. Every run sends the buffer
 * buffer_copy times.
 */
static void csvgrep_csvtools(const char* restrict buffer, size_t buffer_size, unsigned int buffer_copy, unsigned int repeats, unsigned int columns) {
    static const char tool[] = "csvtools csvgrep";
    const unsigned int middle = columns / 2;
    const unsigned int before_last = columns - 1;
    char cmd[255];

    fprintf(stderr, "Running csvtools csvgrep\n");

    // first column: fixed command line
    print_run(tool, "first column", "bin/csvgrep -p 'column1/[a-e]+/' > /dev/null", buffer, buffer_size, buffer_copy, repeats);

    // middle column
    sprintf(cmd, "bin/csvgrep -p 'column%u/[a-e]+/' > /dev/null", middle);
    print_run(tool, "middle column", cmd, buffer, buffer_size, buffer_copy, repeats);

    // last column
    sprintf(cmd, "bin/csvgrep -p 'column%u/[a-e]+/' > /dev/null", columns);
    print_run(tool, "last column", cmd, buffer, buffer_size, buffer_copy, repeats);

    // one pattern on the middle column, one on the column before the last
    sprintf(cmd, "bin/csvgrep -p 'column%u/[a-e]+/' -p 'column%u/[F-L]+/' > /dev/null", middle, before_last);
    print_run(tool, "two columns", cmd, buffer, buffer_size, buffer_copy, repeats);
}
146 | 
/**
 * Writes val into target `repeats` times, with separator between consecutive
 * copies, and terminates the result with '\0'.
 *
 * target must have room for repeats copies of val, the separators in between
 * and the terminator; no bounds are checked here. With repeats == 0 only the
 * terminator is written.
 */
static void repeat(char* restrict target, const char* restrict val, const char separator, size_t repeats) {
    const size_t piece = strlen(val);
    char* out = target;
    unsigned int done = 0;

    while (done < repeats) {
        if (done != 0) {
            // separator goes in front of every copy but the first
            *out++ = separator;
        }
        memcpy(out, val, piece);
        out += piece;
        done++;
    }
    *out = '\0';
}
158 | 
/**
 * Benchmarks grep with hand written regular expressions that emulate a column
 * grep on the first, middle and last column and on two columns.
 *
 * The "skip" patterns repeat "[^,]*" once per column that has to be skipped,
 * so their length grows with the column count. They used to be written into
 * fixed 1024 byte buffers and then formatted into a 255 byte command buffer,
 * which overflowed for larger column counts. All three buffers are now
 * sized from `columns`; if they cannot be allocated the remaining runs are
 * skipped with a message on stderr.
 *
 * NOTE(review): the two column pattern uses "[^,F-L][F-L][F-L]*" where the
 * other patterns use "[^,a-e]*[a-e][a-e]*"; the missing '*' may be a typo.
 * The pattern is left untouched so the benchmark keeps measuring the same
 * expression.
 */
static void csvgrep_gnugrep(const char* restrict buffer, size_t buffer_size, unsigned int buffer_copy, unsigned int repeats, unsigned int columns) {
    static const char skip_cell[] = "[^,]*";

    fprintf(stderr, "Running gnu grep\n");

    print_run("gnutools grep", "first column", "LC_ALL='C' grep \"^[^,a-e]*[a-e][a-e]*\" > /dev/null", buffer, buffer_size, buffer_copy, repeats);

    // number of cells to skip before the middle column, and between the
    // middle column and the column before the last
    const unsigned int lead = columns / 2;
    const unsigned int gap = (columns - 1) - (columns / 2);

    // every repetition needs the pattern plus one separator/terminator byte
    const size_t lead_size = (size_t)lead * sizeof(skip_cell) + 1;
    const size_t gap_size = (size_t)gap * sizeof(skip_cell) + 1;
    // 128 bytes comfortably hold the fixed part of the longest format below
    const size_t command_size = lead_size + gap_size + 128;

    char* skip_commands = malloc(lead_size);
    char* skip_commands2 = malloc(gap_size);
    char* command = malloc(command_size);
    if (skip_commands == NULL || skip_commands2 == NULL || command == NULL) {
        fprintf(stderr, "Out of memory while building the grep patterns for %u columns\n", columns);
        free(skip_commands);
        free(skip_commands2);
        free(command);
        return;
    }

    repeat(skip_commands, skip_cell, ',', lead);
    snprintf(command, command_size, "LC_ALL='C' grep \"^%s,[^,a-e]*[a-e][a-e]*\" > /dev/null", skip_commands);
    print_run("gnutools grep", "middle column", command, buffer, buffer_size, buffer_copy, repeats);

    print_run("gnutools grep", "last column", "LC_ALL='C' grep \"[a-e][a-e]*[^,a-e]*$\" > /dev/null", buffer, buffer_size, buffer_copy, repeats);

    repeat(skip_commands2, skip_cell, ',', gap);
    snprintf(command, command_size, "LC_ALL='C' grep \"^%s,[^,a-e]*[a-e][a-e]*[^,]*,%s,[^,F-L][F-L][F-L]*\" > /dev/null", skip_commands, skip_commands2);
    print_run("gnutools grep", "two columns", command , buffer, buffer_size, buffer_copy, repeats);

    free(command);
    free(skip_commands2);
    free(skip_commands);
}
177 | 
178 | 
/**
 * Benchmarks bin/csvcut with the -d option on the first, middle and last
 * column, on two adjoining columns and on two columns that are further
 * apart. Every run sends the buffer buffer_copy times.
 */
static void csvcut_csvtools(const char* restrict buffer, size_t buffer_size, unsigned int buffer_copy, unsigned int repeats, unsigned int columns) {
    static const char tool[] = "csvtools csvcut";
    const unsigned int middle = columns / 2;
    char cmd[255];

    fprintf(stderr, "Running csvtools csvcut\n");

    // first column: fixed command line
    print_run(tool, "first column", "bin/csvcut -d column1 > /dev/null", buffer, buffer_size, buffer_copy, repeats);

    // middle column
    sprintf(cmd, "bin/csvcut -d column%u > /dev/null", middle);
    print_run(tool, "middle column", cmd, buffer, buffer_size, buffer_copy, repeats);

    // last column
    sprintf(cmd, "bin/csvcut -d column%u > /dev/null", columns);
    print_run(tool, "last column", cmd, buffer, buffer_size, buffer_copy, repeats);

    // two neighbouring columns near the end
    sprintf(cmd, "bin/csvcut -d column%u,column%u > /dev/null", columns - 3, columns - 2);
    print_run(tool, "two adjoining column", cmd, buffer, buffer_size, buffer_copy, repeats);

    // the middle column and the one before the last
    sprintf(cmd, "bin/csvcut -d column%u,column%u > /dev/null", middle, columns - 1);
    print_run(tool, "two distinct column", cmd, buffer, buffer_size, buffer_copy, repeats);
}
196 | 
/**
 * Benchmarks the csvkit based csvcut wrapper script with the -c option on the
 * first, middle and last column, on two adjoining columns and on two columns
 * that are further apart.
 *
 * buffer_copy is ignored: every run sends the buffer exactly once.
 */
static void csvcut_csvkit(const char* restrict buffer, size_t buffer_size, unsigned int buffer_copy, unsigned int repeats, unsigned int columns) {
    static const char tool[] = "csvkit csvcut";
    const unsigned int middle = columns / 2;
    char cmd[255];

    (void)buffer_copy; // not used
    fprintf(stderr, "Running csvkit csvcut\n");

    // first column: fixed command line
    print_run(tool, "first column", "python bench/csvkit-csvcut.py -c column1 > /dev/null", buffer, buffer_size, 1, repeats);

    // middle column
    sprintf(cmd, "python bench/csvkit-csvcut.py -c column%u > /dev/null", middle);
    print_run(tool, "middle column", cmd, buffer, buffer_size, 1, repeats);

    // last column
    sprintf(cmd, "python bench/csvkit-csvcut.py -c column%u > /dev/null", columns);
    print_run(tool, "last column", cmd, buffer, buffer_size, 1, repeats);

    // two neighbouring columns near the end
    sprintf(cmd, "python bench/csvkit-csvcut.py -c column%u,column%u > /dev/null", columns - 3, columns - 2);
    print_run(tool, "two adjoining column", cmd, buffer, buffer_size, 1, repeats);

    // the middle column and the one before the last
    sprintf(cmd, "python bench/csvkit-csvcut.py -c column%u,column%u > /dev/null", middle, columns - 1);
    print_run(tool, "two distinct column", cmd, buffer, buffer_size, 1, repeats);
}
215 | 
/**
 * Appends "<col>," to the field list held in args.
 *
 * args:    buffer of `size` bytes containing a NUL terminated list.
 * written: number of bytes of the list already in use; advanced on success.
 *
 * Returns false, leaving the list as it was, when the field does not fit.
 */
static bool cut_append_field(char* args, size_t size, size_t* written, unsigned int col) {
    const size_t room = size - *written;
    const int needed = snprintf(args + *written, room, "%u,", col);
    if (needed < 0 || (size_t)needed >= room) {
        args[*written] = '\0'; // undo a partially written field
        return false;
    }
    *written += (size_t)needed;
    return true;
}

/**
 * Strips the trailing comma from the field list, formats the cut command line
 * around it and benchmarks that command under the given name.
 */
static void cut_run_fields(const char* name, char* args, size_t written, char* command, size_t command_size, const char* buffer, size_t buffer_size, unsigned int copies, unsigned int repeats) {
    if (written > 0) {
        args[written - 1] = '\0';
    }
    snprintf(command, command_size, "cut -d ',' -f %s > /dev/null", args);
    print_run("cut csvcut", name, command, buffer, buffer_size, copies, repeats);
}

/**
 * Benchmarks cut as a column cutter: five runs that each pass a different
 * list of field numbers to `cut -d ',' -f`.
 *
 * The field lists grow with the column count. They used to be built in a
 * fixed 255 byte buffer with sprintf, which overflowed for larger column
 * counts; the list and command buffers are now sized from `columns` and
 * every append is bounds checked. If the buffers cannot be allocated the
 * runs are skipped with a message on stderr.
 *
 * NOTE(review): the first three runs send the buffer once while the last two
 * use buffer_copy; confirm whether that difference is intended.
 */
static void csvcut_gnucut(const char* restrict buffer, size_t buffer_size, unsigned int buffer_copy, unsigned int repeats, unsigned int columns) {
    fprintf(stderr, "Running gnu cut\n");

    // a field is at most 10 digits plus a comma; the extra bytes hold the terminator
    const size_t args_size = (size_t)columns * 11 + 2;
    // 64 bytes comfortably hold the fixed part of the command line
    const size_t command_size = args_size + 64;
    char* args = malloc(args_size);
    char* command = malloc(command_size);
    if (args == NULL || command == NULL) {
        fprintf(stderr, "Out of memory while building the cut field lists for %u columns\n", columns);
        free(args);
        free(command);
        return;
    }

    size_t written;

    // "first column": every field from the second one on
    args[0] = '\0';
    written = 0;
    for (unsigned int col = 2; col <= columns; col++) {
        if (!cut_append_field(args, args_size, &written, col)) {
            break;
        }
    }
    cut_run_fields("first column", args, written, command, command_size, buffer, buffer_size, 1, repeats);

    // "middle column": every field except the middle one
    args[0] = '\0';
    written = 0;
    for (unsigned int col = 1; col <= columns; col++) {
        if (col != columns / 2) {
            if (!cut_append_field(args, args_size, &written, col)) {
                break;
            }
        }
    }
    cut_run_fields("middle column", args, written, command, command_size, buffer, buffer_size, 1, repeats);

    // "last column": every field except the last one
    args[0] = '\0';
    written = 0;
    for (unsigned int col = 1; col <= columns - 1; col++) {
        if (!cut_append_field(args, args_size, &written, col)) {
            break;
        }
    }
    cut_run_fields("last column", args, written, command, command_size, buffer, buffer_size, 1, repeats);

    // "two adjoining column": only fields columns - 3 and columns - 2
    args[0] = '\0';
    written = 0;
    for (unsigned int col = 1; col <= columns; col++) {
        if (col == columns - 3 || col == columns - 2) {
            if (!cut_append_field(args, args_size, &written, col)) {
                break;
            }
        }
    }
    cut_run_fields("two adjoining column", args, written, command, command_size, buffer, buffer_size, buffer_copy, repeats);

    // "two distinct column": only the middle field and the one before the last
    args[0] = '\0';
    written = 0;
    for (unsigned int col = 1; col <= columns; col++) {
        if (col == columns / 2 || col == columns - 1) {
            if (!cut_append_field(args, args_size, &written, col)) {
                break;
            }
        }
    }
    cut_run_fields("two distinct column", args, written, command, command_size, buffer, buffer_size, buffer_copy, repeats);

    free(command);
    free(args);
}
294 | 
/**
 * Benchmarks sed as a column cutter: substitutions that delete the n-th
 * "cell plus comma" occurrence for the first, middle and last column, for
 * two adjoining columns and for two columns that are further apart. Every
 * run sends the buffer buffer_copy times.
 */
static void csvcut_sed(const char* restrict buffer, size_t buffer_size, unsigned int buffer_copy, unsigned int repeats, unsigned int columns) {
    static const char tool[] = "sed csvcut";
    const unsigned int middle = columns / 2;
    const unsigned int before_last = columns - 1;
    char cmd[255];

    fprintf(stderr, "Running gnu sed\n");

    // first column: fixed command line
    print_run(tool, "first column", "LC_ALL='C' sed 's/^[^,]*,//' > /dev/null", buffer, buffer_size, buffer_copy, repeats);

    // middle column
    sprintf(cmd, "LC_ALL='C' sed 's/[^,]*,//%u' > /dev/null", middle);
    print_run(tool, "middle column", cmd, buffer, buffer_size, buffer_copy, repeats);

    // last column
    sprintf(cmd, "LC_ALL='C' sed 's/[^,]*,//%u' > /dev/null", before_last);
    print_run(tool, "last column", cmd, buffer, buffer_size, buffer_copy, repeats);

    // two neighbouring columns; the later occurrence is removed first
    sprintf(cmd, "LC_ALL='C' sed -e 's/[^,]*,//%u' -e 's/[^,]*,//%u' > /dev/null", columns - 2, columns - 3 );
    print_run(tool, "two adjoining column", cmd, buffer, buffer_size, buffer_copy, repeats);

    // two columns further apart; again the later occurrence goes first
    sprintf(cmd, "LC_ALL='C' sed -e 's/[^,]*,//%u' -e 's/[^,]*,//%u' > /dev/null", before_last, middle);
    print_run(tool, "two distinct column", cmd, buffer, buffer_size, buffer_copy, repeats);
}
314 | 
315 | 
/**
 * Benchmarks bin/csvawk with three scripts: printing the second column,
 * summing the middle column and summing the last column. Every run sends the
 * buffer buffer_copy times.
 */
static void csvawk_csvtools(const char* restrict buffer, size_t buffer_size, unsigned int buffer_copy, unsigned int repeats, unsigned int columns) {
    static const char tool[] = "csvtools csvawk";
    const unsigned int middle = columns / 2;
    char cmd[255];

    fprintf(stderr, "Running csvtools csvawk\n");

    // print one column: fixed command line
    print_run(tool, "print second columnd", "LC_ALL='C' bin/csvawk  '{ print $2; }' > /dev/null", buffer, buffer_size, buffer_copy, repeats);

    // sum over the middle column
    sprintf(cmd, "LC_ALL='C' bin/csvawk 'BEGIN { s = 0; } { s += $%u; } END { print s; }' > /dev/null", middle);
    print_run(tool, "sum middle column", cmd, buffer, buffer_size, buffer_copy, repeats);

    // sum over the last column
    sprintf(cmd, "LC_ALL='C' bin/csvawk 'BEGIN { s = 0; } { s += $%u; } END { print s; }' > /dev/null", columns);
    print_run(tool, "sum last column", cmd, buffer, buffer_size, buffer_copy, repeats);
}
327 | 
/**
 * Benchmarks plain awk (comma as field separator) with three scripts:
 * printing the second column, summing the middle column and summing the last
 * column. Every run sends the buffer buffer_copy times.
 */
static void csvawk_awkraw(const char* restrict buffer, size_t buffer_size, unsigned int buffer_copy, unsigned int repeats, unsigned int columns) {
    static const char tool[] = "raw awk";
    const unsigned int middle = columns / 2;
    char cmd[255];

    fprintf(stderr, "Running raw awk\n");

    // print one column: fixed command line
    print_run(tool, "print second columnd", "LC_ALL='C' awk  -F',' '{ print $2; }' > /dev/null", buffer, buffer_size, buffer_copy, repeats);

    // sum over the middle column
    sprintf(cmd, "LC_ALL='C' awk -F',' 'BEGIN { s = 0; } { s += $%u; } END { print s; }' > /dev/null", middle);
    print_run(tool, "sum middle column", cmd, buffer, buffer_size, buffer_copy, repeats);

    // sum over the last column
    sprintf(cmd, "LC_ALL='C' awk -F',' 'BEGIN { s = 0; } { s += $%u; } END { print s; }' > /dev/null", columns);
    print_run(tool, "sum last column", cmd, buffer, buffer_size, buffer_copy, repeats);
}
339 | 
/**
 * Benchmarks awk combined with the csv-parser.awk script from bench/deps with
 * three scripts: printing csv[1], summing the middle column and summing the
 * last column. The column indexes passed to the script are one lower than
 * the ones used for the other awk benchmarks.
 *
 * buffer_copy is not used: every run sends the buffer exactly once.
 */
static void csvawk_awkcsvparser(const char* restrict buffer, size_t buffer_size, unsigned int buffer_copy, unsigned int repeats, unsigned int columns) {
    static const char tool[] = "csvparser awk";
    const unsigned int middle_index = (columns / 2) - 1;
    const unsigned int last_index = columns - 1;
    char cmd[255];

    fprintf(stderr, "Running awk csvparser\n");

    // print one column: fixed command line
    print_run(tool, "print second columnd", "LC_ALL='C' awk -f bench/deps/awk-csv-parser/src/csv-parser.awk -v separator=',' -v enclosure='\"' --source '{ csv_parse_record($0, separator, enclosure, csv); print csv[1]; }' > /dev/null", buffer, buffer_size, 1, repeats);

    // sum over the middle column
    sprintf(cmd, "LC_ALL='C' awk -f bench/deps/awk-csv-parser/src/csv-parser.awk -v separator=',' -v enclosure='\"' --source 'BEGIN {s = 0; }{ csv_parse_record($0, separator, enclosure, csv); s += csv[%u]; } END { print s; }' > /dev/null", middle_index);
    print_run(tool, "sum middle column", cmd, buffer, buffer_size, 1, repeats);

    // sum over the last column
    sprintf(cmd, "LC_ALL='C' awk -f bench/deps/awk-csv-parser/src/csv-parser.awk -v separator=',' -v enclosure='\"' --source 'BEGIN {s = 0; }{ csv_parse_record($0, separator, enclosure, csv); s += csv[%u]; } END { print s; }' > /dev/null", last_index);
    print_run(tool, "sum last column", cmd, buffer, buffer_size, 1, repeats);
}
351 | 
352 | 
// based on xxhash avalanche
#define PRIME1   2654435761U
#define PRIME2   2246822519U
#define PRIME3   3266489917U

/**
 * Scrambles x together with seed into a well mixed unsigned value.
 *
 * The sum of x, seed and PRIME1 goes through three xor-shift steps (by 15, 13
 * and 16 bits); the first two are each followed by a multiplication with
 * PRIME2 and PRIME3 respectively. All arithmetic wraps around in unsigned
 * int.
 */
static unsigned int xxh_mix(unsigned int x, unsigned int seed) {
    unsigned int h = x + seed + PRIME1;
    h = (h ^ (h >> 15)) * PRIME2;
    h = (h ^ (h >> 13)) * PRIME3;
    return h ^ (h >> 16);
}
367 | 
368 | int main(int argc, char** argv) {
369 |     size_t bench_size = 200 * 1024 * 1024;
370 |     unsigned int columns = 6;
371 |     unsigned int repeats = 5;
372 |     unsigned int bench_copy = 2;
373 |     unsigned int seed1 = xxh_mix(29, 42);
374 |     unsigned int seed2 = xxh_mix(13, 11);
375 |     bool only_csvtools = false;
376 |     bool output_stdout = false;
377 | 
378 |     char c;
379 |     while ((c = getopt (argc, argv, "b:c:r:e:s:xph")) != -1) {
380 |         switch (c) {
381 |             case 'b':
382 |                 sscanf(optarg, "%zu", &bench_size);
383 |                 bench_size *= 1024 * 1024;
384 |                 break;
385 |             case 'c':
386 |                 sscanf(optarg, "%u", &columns);
387 |                 break;
388 |             case 'r':
389 |                 sscanf(optarg, "%u", &repeats);
390 |                 break;
391 |             case 'e':
392 |                 sscanf(optarg, "%u", &bench_copy);
393 |                 break;
394 |             case 's':
395 |                 sscanf(optarg, "%u", &seed1);
396 |                 seed1 = xxh_mix(seed1, 42);
397 |                 break;
398 |             case 'x':
399 |                 only_csvtools = true;
400 |                 break;
401 |             case 'p':
402 |                 output_stdout = true;
403 |                 break;
404 |             case '?':
405 |             case 'h':
406 |             default:
407 |                 print_help();
408 |                 exit(1);
409 |                 break;
410 |         }
411 |     }
412 |     char* buffer = calloc(bench_size, sizeof(char));
413 |     fprintf(stderr, "Preparing data (%zu bytes)\n",bench_size);
414 |     size_t data_filled_small;
415 |     size_t data_filled = generate_csv(buffer, bench_size, &data_filled_small, seed1, seed2, columns);
416 |     fprintf(stderr, "Data ready (%zu bytes)\n",data_filled);
417 |     if (output_stdout) {
418 |         for (unsigned int b = 0; b < bench_copy; b++) {
419 |             fwrite(buffer, sizeof(char), data_filled, stdout);
420 |             fflush(stdout);
421 |         }
422 |         return 0;
423 |     }
424 | 
425 |     fprintf(stdout, "program,name,command,min speed,max speed,median speed");
426 |     fprintf(stdout, "\n");
427 | 
428 |     fprintf(stderr, "Running pipe bench fist\n");
429 |     if (!only_csvtools) {
430 |         print_run("bench pipe", "cat", "cat > /dev/null", buffer, data_filled, bench_copy, repeats);
431 |         print_run("bench pipe", "wc -l",  "wc -l > /dev/null", buffer, data_filled, bench_copy, repeats);
432 |         print_run("bench pipe", "md5sum", "openssl md5 > /dev/null", buffer, data_filled, bench_copy, repeats);
433 |     }
434 | 
435 |     csvgrep_csvtools(buffer, data_filled, bench_copy, repeats, columns);
436 |     if (!only_csvtools) {
437 |         csvgrep_csvkit(buffer, data_filled_small, bench_copy, repeats, columns);
438 |         csvgrep_awk(buffer, data_filled, bench_copy, repeats, columns);
439 |         csvgrep_gnugrep(buffer, data_filled, bench_copy, repeats, columns);
440 |     }
441 | 
442 |     csvcut_csvtools(buffer, data_filled, bench_copy, repeats, columns);
443 |     if (!only_csvtools) {
444 |         csvcut_csvkit(buffer, data_filled_small, bench_copy, repeats, columns);
445 |         csvcut_gnucut(buffer, data_filled, bench_copy, repeats, columns);
446 |         csvcut_sed(buffer, data_filled, bench_copy, repeats, columns);
447 |     }
448 | 
449 |     csvawk_csvtools(buffer, data_filled, bench_copy, repeats / 3, columns);
450 |     if (!only_csvtools) {
451 |         csvawk_awkraw(buffer, data_filled, bench_copy, repeats / 3, columns);
452 |         csvawk_awkcsvparser(buffer, data_filled_small, bench_copy, repeats / 3, columns);
453 |     }
454 | 
455 | 
456 |     return 0;
457 | }
458 | 


--------------------------------------------------------------------------------
/bench/timer.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Author:  David Robert Nadeau
  3 |  * Site:    http://NadeauSoftware.com/
  4 |  * License: Creative Commons Attribution 3.0 Unported License
  5 |  *          http://creativecommons.org/licenses/by/3.0/deed.en_US
  6 |  */
  7 | #ifndef _TIMER_H
  8 | #define _TIMER_H
  9 | #if defined(_WIN32)
 10 | #include <Windows.h>
 11 | 
 12 | #elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__))
 13 | #include <unistd.h>	/* POSIX flags */
 14 | #include <time.h>	/* clock_gettime(), time() */
 15 | #include <sys/time.h>	/* gethrtime(), gettimeofday() */
 16 | 
 17 | #if defined(__MACH__) && defined(__APPLE__)
 18 | #include <mach/mach.h>
 19 | #include <mach/mach_time.h>
 20 | #endif
 21 | 
 22 | #else
 23 | #error "Unable to define getRealTime( ) for an unknown OS."
 24 | #endif
 25 | 
 26 | 
 27 | 
 28 | 
 29 | 
 30 | /**
 31 |  * Returns the real time, in seconds, or -1.0 if an error occurred.
 32 |  *
 33 |  * Time is measured since an arbitrary and OS-dependent start time.
 34 |  * The returned real time is only useful for computing an elapsed time
 35 |  * between two calls to this function.
 36 |  */
 37 | double getRealTime( )
 38 | {
 39 | #if defined(_WIN32)
 40 | 	FILETIME tm;
 41 | 	ULONGLONG t;
 42 | #if defined(NTDDI_WIN8) && NTDDI_VERSION >= NTDDI_WIN8
 43 | 	/* Windows 8, Windows Server 2012 and later. ---------------- */
 44 | 	GetSystemTimePreciseAsFileTime( &tm );
 45 | #else
 46 | 	/* Windows 2000 and later. ---------------------------------- */
 47 | 	GetSystemTimeAsFileTime( &tm );
 48 | #endif
 49 | 	t = ((ULONGLONG)tm.dwHighDateTime << 32) | (ULONGLONG)tm.dwLowDateTime;
 50 | 	return (double)t / 10000000.0;
 51 | 
 52 | #elif (defined(__hpux) || defined(hpux)) || ((defined(__sun__) || defined(__sun) || defined(sun)) && (defined(__SVR4) || defined(__svr4__)))
 53 | 	/* HP-UX, Solaris. ------------------------------------------ */
 54 | 	return (double)gethrtime( ) / 1000000000.0;
 55 | 
 56 | #elif defined(__MACH__) && defined(__APPLE__)
 57 | 	/* OSX. ----------------------------------------------------- */
 58 | 	static double timeConvert = 0.0;
 59 | 	if ( timeConvert == 0.0 )
 60 | 	{
 61 | 		mach_timebase_info_data_t timeBase;
 62 | 		(void)mach_timebase_info( &timeBase );
 63 | 		timeConvert = (double)timeBase.numer /
 64 | 			(double)timeBase.denom /
 65 | 			1000000000.0;
 66 | 	}
 67 | 	return (double)mach_absolute_time( ) * timeConvert;
 68 | 
 69 | #elif defined(_POSIX_VERSION)
 70 | 	/* POSIX. --------------------------------------------------- */
 71 | #if defined(_POSIX_TIMERS) && (_POSIX_TIMERS > 0)
 72 | 	{
 73 | 		struct timespec ts;
 74 | #if defined(CLOCK_MONOTONIC_PRECISE)
 75 | 		/* BSD. --------------------------------------------- */
 76 | 		const clockid_t id = CLOCK_MONOTONIC_PRECISE;
 77 | #elif defined(CLOCK_MONOTONIC_RAW)
 78 | 		/* Linux. ------------------------------------------- */
 79 | 		const clockid_t id = CLOCK_MONOTONIC_RAW;
 80 | #elif defined(CLOCK_HIGHRES)
 81 | 		/* Solaris. ----------------------------------------- */
 82 | 		const clockid_t id = CLOCK_HIGHRES;
 83 | #elif defined(CLOCK_MONOTONIC)
 84 | 		/* AIX, BSD, Linux, POSIX, Solaris. ----------------- */
 85 | 		const clockid_t id = CLOCK_MONOTONIC;
 86 | #elif defined(CLOCK_REALTIME)
 87 | 		/* AIX, BSD, HP-UX, Linux, POSIX. ------------------- */
 88 | 		const clockid_t id = CLOCK_REALTIME;
 89 | #else
 90 | 		const clockid_t id = (clockid_t)-1;	/* Unknown. */
 91 | #endif /* CLOCK_* */
 92 | 		if ( id != (clockid_t)-1 && clock_gettime( id, &ts ) != -1 )
 93 | 			return (double)ts.tv_sec +
 94 | 				(double)ts.tv_nsec / 1000000000.0;
 95 | 		/* Fall thru. */
 96 | 	}
 97 | #endif /* _POSIX_TIMERS */
 98 | 
 99 | 	/* AIX, BSD, Cygwin, HP-UX, Linux, OSX, POSIX, Solaris. ----- */
100 | 	struct timeval tm;
101 | 	gettimeofday( &tm, NULL );
102 | 	return (double)tm.tv_sec + (double)tm.tv_usec / 1000000.0;
103 | #else
104 | 	return -1.0;		/* Failed. */
105 | #endif
106 | }
107 | #endif
108 | 


--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | 


--------------------------------------------------------------------------------
/src/csv_tokenizer.c:
--------------------------------------------------------------------------------
  1 | #include <stddef.h>
  2 | #include <stdbool.h>
  3 | #include <stdio.h>
  4 | #include <stdlib.h>
  5 | #include <assert.h>
  6 | #include <limits.h>
  7 | #include <string.h>
  8 | #include "debug.h"
  9 | #include "csv_tokenizer.h"
 10 | 
 11 | #if defined(_GNU_SOURCE) && !defined(SLOW_PATH)
 12 |     #define FAST_GNU_LIBC
 13 | #endif
 14 | 
// Where the previous tokenize_cells() call stopped. The value is read at the
// start of the next call to decide how tokenizing has to resume, and is reset
// to FRESH before that call does any work.
enum tokenizer_state {
    FRESH,          // nothing special to resume; handled by the default branch
    PREV_NEWLINE,   // the previous chunk ended while consuming a run of '\n' / '\r'
    PREV_CELL,      // resume in the middle of an unquoted cell (jumps to NORMAL_CELL)
    PREV_QUOTE,     // the previous chunk ended right after a '"': unknown whether it was escaped
    IN_QUOTE        // the previous chunk ended inside a quoted cell
};
 22 | 
 23 | 
// State of one tokenizer instance, created by setup_tokenizer() and released
// by free_tokenizer().
struct csv_tokenizer {
    // start of the character buffer; tokenize_cells() adds its offsets to this
    const char* restrict buffer;
    // start of the caller supplied array that receives the cells
    Cell* restrict cells;
    // stop threshold in the cells array: two entries before its real end
    Cell const* restrict cells_end;
#ifdef FAST_GNU_LIBC
    // "\r\n<separator>" plus terminator; the reject set handed to strcspn()
    char scan_mask[4];
#endif

    // number of record ends seen so far; used in error messages
    unsigned long long records_processed;

    // character that separates cells
    char separator;

    // where the previous tokenize_cells() call stopped
    enum tokenizer_state state;
};
 38 | 
 39 | struct csv_tokenizer* setup_tokenizer(char separator, const char* restrict buffer, Cell* restrict cells, size_t cell_size) {
 40 |     struct csv_tokenizer* tokenizer = malloc(sizeof(struct csv_tokenizer));
 41 |     tokenizer->separator = separator;
 42 |     tokenizer->buffer = buffer;
 43 |     tokenizer->cells = cells;
 44 |     tokenizer->cells_end = cells + cell_size - 2; // two room at the end
 45 |     assert(tokenizer->cells < tokenizer->cells_end);
 46 | #ifdef FAST_GNU_LIBC
 47 |     tokenizer->scan_mask[0] = '\r';
 48 |     tokenizer->scan_mask[1] = '\n';
 49 |     tokenizer->scan_mask[2] = separator;
 50 |     tokenizer->scan_mask[3] = '\0';
 51 | #endif
 52 |     tokenizer->records_processed = 0;
 53 |     tokenizer->state = FRESH;
 54 |     return tokenizer;
 55 | }
 56 | 
 57 | void free_tokenizer(struct csv_tokenizer* restrict tokenizer) {
 58 |     free(tokenizer);
 59 | }
 60 | 
 61 | static void print_current_line(const char* restrict current_char,const char* restrict buffer_start, const char* restrict buffer_end) {
 62 |     const char* restrict start = current_char;
 63 |     const char* restrict end = current_char;
 64 | 
 65 |     // find surround newlines
 66 |     while (--start > buffer_start  && *start != '\n' && *start != '\r');
 67 |     start++;
 68 |     while (++end < buffer_end  && *end != '\n' && *end != '\r');
 69 |     end--;
 70 | 
 71 |     // copy string such that we can put a \0 at the end
 72 |     size_t line_length = end-start;
 73 |     char* printable_string = calloc(sizeof(char), line_length + 1);
 74 |     memcpy(printable_string, start, line_length);
 75 |     printable_string[line_length] = '\0';
 76 |     fprintf(stderr, "Current line: %s\n", printable_string);
 77 |     free(printable_string);
 78 | }
 79 | 
 80 | void prepare_tokenization(struct csv_tokenizer* restrict tokenizer, char* restrict buffer, size_t buffer_read) {
 81 |     buffer[buffer_read] = '\0';
 82 |     buffer[buffer_read + 1] = tokenizer->separator;
 83 |     buffer[buffer_read + 2] = '\r';
 84 |     buffer[buffer_read + 3] = '"';
 85 | }
 86 | 
 87 | void tokenize_cells(struct csv_tokenizer* restrict tokenizer, size_t buffer_offset, size_t buffer_read, size_t* restrict buffer_consumed, size_t* restrict cells_found, bool* restrict last_full) {
 88 |     const char* restrict current_char = tokenizer->buffer + buffer_offset;
 89 |     const char* restrict char_end = tokenizer->buffer + buffer_read;
 90 |     const char* restrict current_start = current_char;
 91 | 
 92 | 
 93 | #ifndef FAST_GNU_LIBC
 94 |     assert(CHAR_BIT == 8);
 95 |     bool cell_delimitor[256];
 96 |     memset(cell_delimitor, false, sizeof(bool) * 256);
 97 |     cell_delimitor[(unsigned char)tokenizer->separator] = true;
 98 |     cell_delimitor[(unsigned char)'\n'] = true;
 99 |     cell_delimitor[(unsigned char)'\r'] = true;
100 | #endif
101 | 
102 |     Cell* restrict cell = tokenizer->cells;
103 |     LOG_V("tokenizer-start\t%d %c (%lu)\n", tokenizer->state, *current_char, buffer_offset );
104 | 
105 |     *last_full = true;
106 |     enum tokenizer_state old_state = tokenizer->state;
107 |     tokenizer->state = FRESH;
108 |     switch (old_state) {
109 |     case PREV_QUOTE:
110 |         if (*current_char == '"') {
111 |             // escaped quote so we don't have to decrease the first char
112 |             goto IN_QUOTE;
113 |         }
114 |         current_char--; // jump back, since starts with increment
115 |         goto AFTER_QUOTE;
116 | 
117 |     case IN_QUOTE:
118 |         current_char--; // jump back, since the loops starts with increment
119 |         goto IN_QUOTE;
120 | 
121 |     case PREV_NEWLINE:
122 |         if ((*current_char == '\n' || *current_char == '\r')) {
123 |             while (++current_char < char_end && (*current_char == '\n' || *current_char == '\r'));
124 |         }
125 |         break;
126 | 
127 |     case PREV_CELL:
128 |         current_char--; // jump back, since the loops starts with increment
129 |         goto NORMAL_CELL;
130 | 
131 |     default:
132 |         if (*current_char == tokenizer->separator) {
133 |             cell->start = current_start;
134 |             cell->length = 0;
135 |             cell++;
136 |             current_char++;
137 |             current_start = current_char;
138 |         }
139 |     }
140 | 
141 |     while (current_char < char_end) {
142 |         if (*current_char == '"') {
143 | IN_QUOTE:;
144 |             while(++current_char < char_end) {
145 | #ifdef _GNU_SOURCE
146 |                 current_char = rawmemchr(current_char, '"');
147 | #else
148 |                 current_char = memchr(current_char, '"', char_end - current_char);
149 | #endif
150 |                 if (current_char == NULL || current_char > char_end) {
151 |                     // end of stream reached before end of cell found
152 |                     current_char = char_end;
153 |                     break;
154 |                 }
155 |                 else {
156 |                     const char* peek = current_char + 1;
157 |                     assert(peek <= char_end);
158 |                     if (peek == char_end) {
159 |                         // at the end of stream and not sure if escaped or not
160 |                         tokenizer->state = PREV_QUOTE;
161 |                         *last_full = false;
162 |                         break;
163 |                     }
164 |                     else if (*peek == '"') {
165 |                         current_char++;
166 |                         continue;
167 |                     }
168 |                     else {
169 |                         break;
170 |                     }
171 |                 }
172 |             }
173 | AFTER_QUOTE:
174 |             if (current_char != char_end) {
175 |                 current_char++;
176 |             }
177 |             cell->start = current_start;
178 |             cell->length = (size_t)((current_char)-current_start);
179 |             cell++;
180 | 
181 |             if (current_char == char_end) {
182 |                 if (*(current_char-1) != '"' || *(current_char-2) == '"' ||  current_char - 1 == current_start) {
183 |                     if (tokenizer->state == FRESH) {
184 |                         tokenizer->state = IN_QUOTE;
185 |                     }
186 |                     *last_full = false;
187 |                     break;
188 |                 }
189 |                 *last_full = false; // is this correct? does it ever happen?
190 |                 break;
191 |             }
192 | 
193 |             if (*current_char == '\n' || *current_char == '\r') {
194 |                 cell->start = NULL;
195 |                 cell->length = -1;
196 |                 cell++;
197 |                 tokenizer->records_processed++;
198 |                 // consume newline
199 |                 while (++current_char < char_end && (*current_char == '\n' || *current_char == '\r'));
200 |                 if (current_char == char_end) {
201 |                     // we stopped inside a new_line
202 |                     tokenizer->state = PREV_NEWLINE;
203 |                     break;
204 |                 }
205 |                 current_start = current_char;
206 |             }
207 |             else if (*current_char == tokenizer->separator) {
208 |                 current_char++;
209 |                 current_start = current_char;
210 |             }
211 |             else {
212 |                 fprintf(stderr, "Invalid character: \"%c (\\%d)\" found after end of cell (after the %lluth record)\n",*current_char, *current_char,tokenizer->records_processed);
213 |                 print_current_line(current_char, tokenizer->buffer, char_end);
214 |                 exit(1);
215 |                 return;
216 |             }
217 |             if (cell >= tokenizer->cells_end) {
218 |                 break;
219 |             }
220 |         }
221 |         else if (*current_char == tokenizer->separator) {
222 |             // an empty cell somewhere in the middle
223 |             cell->start = current_start;
224 |             cell->length = 0;
225 |             cell++;
226 |             current_start = ++current_char;
227 |             if (cell >= tokenizer->cells_end) {
228 |                 break;
229 |             }
230 |         }
231 |         else if (*current_char == '\n' || *current_char == '\r') {
232 |             // an newline means that we had an empty cell as last cell of the
233 |             // row
234 |             cell->start = current_start;
235 |             cell->length = 0;
236 |             cell++;
237 | 
238 |             cell->start = NULL;
239 |             cell->length = -1;
240 |             cell++;
241 |             tokenizer->records_processed++;
242 |             // consume newline
243 |             while (++current_char < char_end && (*current_char == '\n' || *current_char == '\r'));
244 |             if (current_char == char_end) {
245 |                 // we stopped inside a new_line
246 |                 tokenizer->state = PREV_NEWLINE;
247 |                 break;
248 |             }
249 |             current_start = current_char;
250 |             if (cell >= tokenizer->cells_end) {
251 |                 break;
252 |             }
253 |         }
254 |         else {
255 |             // start of a new field
256 | NORMAL_CELL:;
257 | #ifdef FAST_GNU_LIBC
258 |             do {
259 |                 current_char++;
260 |                 current_char += strcspn(current_char, tokenizer->scan_mask);
261 |             } while (current_char < char_end && *current_char == '\0'); // strspn stops at 0 chars.
262 | #else
263 |             while (true) {
264 |                 if (cell_delimitor[(unsigned char)current_char[1]]) {
265 |                     current_char += 1;
266 |                     goto FOUND_CELL_END;
267 |                 }
268 |                 if (cell_delimitor[(unsigned char)current_char[2]]) {
269 |                     current_char += 2;
270 |                     goto FOUND_CELL_END;
271 |                 }
272 |                 if (cell_delimitor[(unsigned char)current_char[3]]) {
273 |                     current_char += 3;
274 |                     goto FOUND_CELL_END;
275 |                 }
276 |                 current_char += 4;
277 |                 if (cell_delimitor[(unsigned char)current_char[0]]) {
278 |                     goto FOUND_CELL_END;
279 |                 }
280 |             }
281 | FOUND_CELL_END:;
282 | #endif
283 |             if (current_char > char_end) {
284 |                 current_char = char_end;
285 |             }
286 |             cell->start = current_start;
287 |             cell->length = (size_t)((current_char)-current_start);
288 |             cell++;
289 | 
290 |             if (current_char == char_end) {
291 |                 if (*(current_char-1) != tokenizer->separator) {
292 |                     tokenizer->state = PREV_CELL;
293 |                     *last_full = false;
294 |                     break;
295 |                 }
296 |             }
297 |             else if (*current_char == '\n' || *current_char == '\r') {
298 |                 cell->start = NULL;
299 |                 cell->length = -1;
300 |                 cell++;
301 |                 tokenizer->records_processed++;
302 |                 // consume newline
303 |                 while (++current_char < char_end && (*current_char == '\n' || *current_char == '\r'));
304 |                 if (current_char == char_end) {
305 |                     // we stopped inside a new_line
306 |                     tokenizer->state = PREV_NEWLINE;
307 |                     break;
308 |                 }
309 |                 current_start = current_char;
310 |             }
311 |             else if (*current_char == tokenizer->separator) {
312 |                 current_char++;
313 |                 current_start = current_char;
314 |             }
315 |             else {
316 |                 fprintf(stderr, "Invalid character: \"%c (\\%d)\" found after end of cell (after the %lluth record)\n",*current_char, *current_char,tokenizer->records_processed);
317 |                 print_current_line(current_char, tokenizer->buffer, char_end);
318 |                 exit(1);
319 |                 return;
320 |             }
321 |             if (cell >= tokenizer->cells_end) {
322 |                 break;
323 |             }
324 |         }
325 |     }
326 |     *buffer_consumed = (size_t)(current_char - tokenizer->buffer);
327 |     *cells_found = (size_t)(cell - tokenizer->cells);
328 | 
329 |     LOG_V("tokenizer-done\t%d, %c (%lu) %d\n", tokenizer->state,  *(current_char-1), *buffer_consumed  , *last_full);
330 | }
331 | 
332 | 


--------------------------------------------------------------------------------
/src/csv_tokenizer.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <stddef.h>
 3 | #include <stdbool.h>
 4 | 
// Number of extra bytes the caller must allocate after the data area of the
// buffer handed to the tokenizer (csvcut declares its buffer as
// BUFFER_SIZE + BUFFER_TOKENIZER_POSTFIX).
// NOTE(review): presumably this is slack for the tokenizer's unrolled scan,
// which looks a few bytes ahead of the current position -- confirm in
// csv_tokenizer.c.
#define BUFFER_TOKENIZER_POSTFIX 4

// Opaque tokenizer state; the definition lives in csv_tokenizer.c.
struct csv_tokenizer;

// One tokenized cell: a pointer into the input buffer plus a byte length.
// The tokenizer marks the end of a record with a cell whose start is NULL
// and whose length is (size_t)-1.
typedef struct {
    char const * restrict start;
    size_t length;
} Cell;

// Called by csvcut once for every freshly read block, before tokenize_cells,
// with the buffer and the number of bytes that were read into it.
void prepare_tokenization(struct csv_tokenizer* restrict tokenizer, char* restrict buffer, size_t buffer_read);

// Creates a tokenizer for the given separator character, input buffer and
// output cell array; cell_size is passed by csvcut as the number of entries
// in `cells`. The result is released with free_tokenizer.
struct csv_tokenizer* setup_tokenizer(char separator, const char* restrict buffer, Cell* restrict cells, size_t cell_size);

// Tokenizes buffer contents into the cell array given to setup_tokenizer.
// On return:
//   *buffer_consumed  offset in the buffer at which tokenizing stopped
//   *cells_found      number of Cell entries that were written
//   *last_full        set to false when the last cell was cut off by the end
//                     of the buffer
// csvcut passes the previous *buffer_consumed back in as buffer_offset until
// buffer_read bytes have been consumed.
void tokenize_cells(struct csv_tokenizer* restrict tokenizer, size_t buffer_offset, size_t buffer_read, size_t* restrict buffer_consumed, size_t* restrict cells_found, bool* restrict last_full);

// Releases a tokenizer obtained from setup_tokenizer.
void free_tokenizer(struct csv_tokenizer* restrict tokenizer);
19 | 


--------------------------------------------------------------------------------
/src/csvawk.c:
--------------------------------------------------------------------------------
  1 | #include <stdbool.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <unistd.h>
  5 | #include <limits.h>     
  6 | #include <stdbool.h>
  7 | #include <assert.h>
  8 | #include <string.h>
  9 | #include "debug.h"
 10 | #include "hints.h"
 11 | 
 12 | #define AWK_ROW_SEPARATOR '\x1E'
 13 | #define AWK_CELL_SEPARATOR '\x1F'
 14 | 
 15 | #if defined(_GNU_SOURCE) && !defined(SLOW_PATH)
 16 |     #define FAST_GNU_LIBC
 17 | #endif
 18 | 
 19 | static char _buffer[BUFFER_SIZE + 2];
 20 | 
 21 | static struct {
 22 |     FILE* source;
 23 |     char separator;
 24 |     bool drop_header;
 25 | #ifdef FAST_GNU_LIBC
 26 |     char scan_mask[4];
 27 | #endif
 28 |     char* script;
 29 |     FILE* target;
 30 | } config;
 31 | 
 32 | static void start_awk();
 33 | static void parse_config(int argc, char** argv);
 34 | static void do_pipe(size_t chars_read);
 35 | int main(int argc, char** argv) {
 36 |     parse_config(argc, argv);
 37 | 
 38 |     if (config.target != stdout) {
 39 |         start_awk();
 40 |     }
 41 | 
 42 |     size_t chars_read;
 43 |     SEQUENTIAL_HINT(config.source);
 44 |     while ((chars_read = fread(_buffer, sizeof(char), BUFFER_SIZE, config.source)) > 0) {
 45 |         _buffer[chars_read] = '\0';
 46 |         _buffer[chars_read+1] = '\"';
 47 |         do_pipe(chars_read);
 48 |     }
 49 |     if (config.source != stdin) {
 50 |         fclose(config.source);
 51 |     }
 52 |     if (config.target != stdout) {
 53 |         pclose(config.target);
 54 |     }
 55 |     return 0;
 56 | }
 57 | 
 58 | static void print_help() {
 59 |     fprintf(stderr,"usage: csvawk [OPTIONS] AWKSCRIPT [FILE]");
 60 |     fprintf(stderr,"options:");
 61 |     fprintf(stderr, "-s ,\n");
 62 |     fprintf(stderr, "  Which character to use as separator (default is ,)\n");
 63 |     fprintf(stderr, "-d\n");
 64 |     fprintf(stderr, "  drop header row\n");
 65 | }
 66 | 
 67 | static void parse_config(int argc, char** argv) {
 68 |     config.source = stdin;
 69 |     config.separator = ',';
 70 |     config.drop_header = false;
 71 | 
 72 |     char c;
 73 |     while ((c = getopt (argc, argv, "s:dp")) != -1) {
 74 |         switch (c) {
 75 |             case 's': 
 76 |                 config.separator = optarg[0];
 77 |                 break;
 78 |             case 'd':
 79 |                 config.drop_header = true;
 80 |                 break;
 81 |             case 'p':
 82 |                 config.target = stdout;
 83 |                 break;
 84 |             case '?':
 85 |             case 'h':
 86 |                 print_help();
 87 |                 exit(1);
 88 |                 break;
 89 |         }
 90 |     }
 91 |     int args_left = argc - optind;
 92 |     switch(args_left) {
 93 |         case 0:
 94 |             fprintf(stderr, "Missing AWK script\n");
 95 |             print_help();
 96 |             exit(1);
 97 |             break;
 98 |         case 2:
 99 |             config.source = fopen(argv[argc - 1], "r");
100 |             if (!config.source) {
101 |                 fprintf(stderr, "Could not open file %s for reading\n", argv[optind]);
102 |                 exit(1);
103 |             }
104 |             // fall through
105 |         case 1:
106 |             config.script = argv[optind];
107 |             break;
108 |         default:
109 |             if (args_left > 2) {
110 |                 fprintf(stderr, "Too many arguments\n");
111 |             }
112 |             else {
113 |                 fprintf(stderr, "Missing AWK script\n");
114 |             }
115 |             print_help();
116 |             exit(1);
117 |             break;
118 |     }
119 | 
120 | #ifdef FAST_GNU_LIBC
121 |     config.scan_mask[0] = '\r';
122 |     config.scan_mask[1] = '\n';
123 |     config.scan_mask[2] = config.separator;
124 |     config.scan_mask[3] = '\"';
125 | #endif
126 | 
127 |     LOG_D("%s\n","Done parsing config params");    
128 | }
129 | 
// Scanner state that do_pipe() carries from one input block to the next.
enum tokenizer_state {
    // nothing is pending from the previous block
    FRESH,
    // the previous block ended right after a '\r'; a '\n' at the start of
    // the next block is part of the same line ending and gets skipped
    PREV_NEWLINE,
    // the previous block ended on a '"' inside a quoted cell; only the next
    // block tells whether it closed the cell or started an escaped ""
    PREV_QUOTE,
    // the previous block ended in the middle of a quoted cell
    IN_QUOTE,
};

// true until the header line has been skipped (only consulted with -d)
static bool first_run = true;
// state left behind by the previous do_pipe() call
static enum tokenizer_state _state = FRESH;
139 | 
/*
 * Converts the first `chars_read` bytes of _buffer in place and writes the
 * result to config.target.
 *
 * Outside of quoted cells every '\n' or '\r' becomes AWK_ROW_SEPARATOR and
 * every config.separator becomes AWK_CELL_SEPARATOR; the '\n' of a "\r\n"
 * pair is dropped. Quoted cells (including their quote characters) are
 * passed through unchanged. With config.drop_header set, everything up to
 * and including the first line break of the input is discarded.
 *
 * A block can end in the middle of a quoted cell or of a line ending; what
 * was pending is stored in _state and picked up by the next call.
 */
static void do_pipe(size_t chars_read) {
    char* restrict current_char = _buffer;
    char const* restrict char_end = _buffer + chars_read;
    // start of the bytes that still have to be written to the target
    char const* restrict current_start = _buffer;
    LOG_V("Piping: %zu state: %d first char: %c\n", chars_read, _state, *current_char);

#ifndef FAST_GNU_LIBC
    // lookup table for the slow path: true for every byte that ends a run of
    // ordinary characters
    assert(CHAR_BIT == 8);
    bool cell_delimitor[256];
    memset(cell_delimitor, false, sizeof(bool) * 256);
    cell_delimitor[(unsigned char)config.separator] = true;
    cell_delimitor[(unsigned char)'\n'] = true;
    cell_delimitor[(unsigned char)'\r'] = true;
    cell_delimitor[(unsigned char)'\"'] = true;
#endif

    if (config.drop_header && first_run) {
        // skip everything up to and including the first line break
        while (current_char < char_end) {
            if (*current_char == '\n' || *current_char == '\r') {
                if (*current_char == '\r') {
                    _state = PREV_NEWLINE; // handle the windows newlines correctly
                }
                current_start = ++current_char;
                first_run = false;
                break;
            }
            current_char++;
        }
        if (current_char == char_end) {
            // nothing of this block is left to forward; when no line break
            // was found first_run is still true and the next block keeps
            // skipping
            return;
        }
    }

    // resume whatever the previous block left unfinished
    switch(_state) {
        case PREV_QUOTE:
            _state = FRESH; // reset state
            if (*current_char == '"') {
                // we have two quotes
                // one in the previous block, one in the current
                goto IN_QUOTE;
            }
            // we were at the end of the quoted cell, so let's continue
            break;
        case IN_QUOTE:
            current_char--; // the loop starts with a increment
            goto IN_QUOTE;
        case PREV_NEWLINE:
            if (*current_char == '\n') {
                // we already had a newline, so lets eat this second windows
                // newline
                current_char++;
                current_start++;
            }
            _state = FRESH;
            break;
        default:
            break;
    }

    while (current_char < char_end) {
        if (*current_char == '"') {
IN_QUOTE:
            // walk to the closing quote; a doubled "" is an escaped quote
            // and stays inside the cell
            while (++current_char < char_end) {
                if (*current_char == '"') {
                    char const* peek = current_char + 1;
                    if (peek == char_end) {
                        current_char++;
                        _state = PREV_QUOTE;
                        // at the end of stream and not sure if escaped or not
                        break;
                    }
                    else if (*peek == '"') {
                        current_char++;
                        continue;
                    }
                    else {
                        break;
                    }
                }
            }
            if (current_char == char_end) {
                // we are at the end, let's write everything we've seen
                if (_state != PREV_QUOTE) {
                    _state = IN_QUOTE;
                }
                break;
            }
            else {
                // step over the closing quote
                current_char++;
                _state = FRESH;
            }
        }
        else if (*current_char == '\n') {
            *current_char = AWK_ROW_SEPARATOR;
            current_char++;
        }
        else if (*current_char == '\r') {
            *current_char = AWK_ROW_SEPARATOR;
            current_char++;
            if (current_char == char_end) {
                // a '\n' completing this line ending may open the next block
                _state = PREV_NEWLINE;
                break;
            }
            else if (*current_char == '\n') {
                // we have windows new lines, so lets skip over this byte
                // (flush everything before the '\n', then continue behind it)
                fwrite(current_start, sizeof(char), current_char - current_start, config.target);
                current_char++;
                current_start = current_char;
            }
        }
        else if (*current_char == config.separator) {
            *current_char = AWK_CELL_SEPARATOR;
            current_char++;
        }
        else {
            // all other chars, just skip until we find another interesting character
#ifdef FAST_GNU_LIBC
            // strcspn() also stops at a '\0' inside the data, so keep going
            // until a real delimiter or the end of the block is reached
            do {
                current_char++;
                current_char += strcspn(current_char, config.scan_mask);
            } while (current_char < char_end && *current_char == '\0'); // strspn stops at 0 chars.
#else
            // unrolled scan, four bytes per round; it has no bounds check and
            // stops at the latest on the '"' that main() stores behind the data
            while (true) {
                if (cell_delimitor[(unsigned char)current_char[1]]) {
                    current_char += 1;
                    goto FOUND_CELL_END;
                }
                if (cell_delimitor[(unsigned char)current_char[2]]) {
                    current_char += 2;
                    goto FOUND_CELL_END;
                }
                if (cell_delimitor[(unsigned char)current_char[3]]) {
                    current_char += 3;
                    goto FOUND_CELL_END;
                }
                current_char += 4;
                if (cell_delimitor[(unsigned char)current_char[0]]) {
                    goto FOUND_CELL_END;
                }
            }
FOUND_CELL_END:;
#endif
            while (current_char > char_end) {
                // we added a \0 past the end just to detect the end, so let's revert to the actual end
                current_char--;
            }
        }
    }
    // forward what has not been written yet and hand it to the consumer
    if (current_start < char_end) {
        fwrite(current_start, sizeof(char), char_end - current_start, config.target);
    }
    fflush(config.target);
}
293 | 
294 | void start_awk() {
295 |     char* prefix = "awk \'BEGIN{ FS=\"\\x1F\"; RS=\"\\x1E\" } ";
296 |     char* command = calloc(strlen(prefix) + strlen(config.script) + 2, sizeof(char));
297 |     sprintf(command, "%s %s\'", prefix, config.script);
298 |     config.target = popen(command, "w");
299 |     if (!config.target) {
300 |         fprintf(stderr, "Can't start \"%s\"\n", command);
301 |         exit(1);
302 |     }
303 |     free(command);
304 | }
305 | 
306 | 


--------------------------------------------------------------------------------
/src/csvcut.c:
--------------------------------------------------------------------------------
  1 | #include <stdbool.h>
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <unistd.h>
  5 | #include <limits.h>     
  6 | #include <stdbool.h>
  7 | #include <assert.h>
  8 | #include <string.h>
  9 | #include "csv_tokenizer.h"
 10 | #include "debug.h"
 11 | #include "hints.h"
 12 | 
 13 | //#define BUFFER_SIZE 30
 14 | //#define BUFFER_SIZE 72
 15 | #define CELL_BUFFER_SIZE (BUFFER_SIZE / 2) + 2
 16 | 
 17 | 
 18 | struct csv_tokenizer* _tokenizer;
 19 | 
 20 | static char _buffer[BUFFER_SIZE + BUFFER_TOKENIZER_POSTFIX];
 21 | static Cell _cells[CELL_BUFFER_SIZE];
 22 | 
// Run-time configuration of csvcut. parse_config() fills in source and
// separator; finish_config() derives the remaining fields from the header row.
static struct {
    // stream the CSV is read from (stdin or the FILE argument)
    FILE* source;
    // not referenced by the csvcut code shown here; output goes to stdout
    FILE* target;

    // cell separator character
    char separator;
    // line ending written after every output row, and its length in bytes
    char newline[2];
    size_t newline_length;

    // keep[c] is true when column c is written to the output
    bool* keep;
    // number of cells in the header row
    int column_count;
    // index of the first kept column
    int first_cell;
} config;
 35 | 
 36 | 
 37 | static void parse_config(int argc, char** argv);
 38 | static void finish_config(size_t cells_found);
 39 | 
 40 | static void output_cells(size_t cells_found, bool last_full);
 41 | static void debug_cells(size_t total);
 42 | 
 43 | int main(int argc, char** argv) {
 44 |     size_t chars_read;
 45 |     bool first = true;
 46 | 
 47 |     parse_config(argc, argv);
 48 | 
 49 |     SEQUENTIAL_HINT(config.source);
 50 |     while ((chars_read = fread(_buffer, 1, BUFFER_SIZE, config.source)) > 0) {
 51 |         LOG_D("New data read: %zu\n", chars_read);
 52 |         size_t buffer_consumed = 0;
 53 |         size_t cells_found = 0;
 54 |         bool last_full = true;
 55 | 
 56 |         prepare_tokenization(_tokenizer, _buffer, chars_read);
 57 |         while (buffer_consumed < chars_read) {
 58 |             tokenize_cells(_tokenizer, buffer_consumed, chars_read, &buffer_consumed, &cells_found, &last_full);
 59 |             if (first == true) {
 60 |                 first = false;
 61 |                 finish_config(cells_found);
 62 |             }
 63 | 
 64 |             LOG_D("Processed: %zu, Cells: %zu\n", buffer_consumed, cells_found);
 65 |             debug_cells(cells_found);
 66 |             output_cells(cells_found, last_full);
 67 |         }
 68 |     }
 69 |     if (_tokenizer != NULL) {
 70 |         free_tokenizer(_tokenizer);
 71 |     }
 72 |     if (config.keep != NULL) {
 73 |         free(config.keep);
 74 |     }
 75 |     if (config.source != stdin) {
 76 |         fclose(config.source);
 77 |     }
 78 |     return 0;
 79 | }
 80 | 
 81 | static void debug_cells(size_t total) {
 82 | #ifdef MOREDEBUG
 83 |     Cell* current_cell = _cells;
 84 |     Cell* cell_end = _cells + total;
 85 | 
 86 |     while (current_cell < cell_end) {
 87 |         if (current_cell->start == NULL) {
 88 |             LOG_V("Cell %zu : Newline\n", (size_t)(current_cell - _cells));
 89 |         }
 90 |         else if (current_cell->length == 0) {
 91 |             LOG_V("Cell %zu : \n", (size_t)(current_cell - _cells));
 92 |         }
 93 |         else {
 94 |             char* s = calloc(sizeof(char), current_cell->length + 1);
 95 |             s[current_cell->length] = '\0';
 96 |             memcpy(s, current_cell->start, current_cell->length);
 97 |             LOG_V("Cell %zu : %s\n", (size_t)(current_cell - _cells), s);
 98 |             free(s);
 99 |         }
100 |         current_cell++;
101 |     }
102 | #else
103 |     (void)total;
104 | #endif
105 | }
106 | 
/*
 * Prints the command line usage of csvcut to stderr, one item per line.
 */
static void print_help() {
    fprintf(stderr, "usage: csvcut [OPTIONS] [FILE]\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "-s ,\n");
    fprintf(stderr, "\tWhich character to use as separator (default is ,)\n");
    fprintf(stderr, "-k column,names,to,keep\n");
    fprintf(stderr, "-d column,names,to,drop\n");
    fprintf(stderr, "-K 0,1,3\n");
    fprintf(stderr, "\tWhich columns to keep\n");
    fprintf(stderr, "-D 0,1,3\n");
    fprintf(stderr, "\tWhich columns to drop\n");
    fprintf(stderr, "-e\n");
    fprintf(stderr, "\tProvide column names one at a time, useful in case of embedded commas.\n");
}
121 | 
// How the column selection on the command line has to be interpreted.
enum column_kind {
    NONE,           // no -k/-d/-K/-D option seen (yet)
    KEEP_NAMES,     // -k: keep the columns with the given header names
    KEEP_INDEXES,   // -K: keep the columns with the given indexes
    DROP_NAMES,     // -d: drop the columns with the given header names
    DROP_INDEXES    // -D: drop the columns with the given indexes
};

// Column selection as typed by the user. It is collected by parse_config()
// and turned into config.keep by finish_config(), which also frees `cuts`.
static struct {
    enum column_kind kind;
    // number of entries in `cuts`
    size_t cuts_defined;
    // selected column names or indexes; the strings point into argv
    const char** cuts;
} preconfig;
135 | 
136 | static void parse_config(int argc, char** argv) {
137 |     config.separator = ',';
138 |     config.source = stdin;
139 | 
140 |     
141 |     preconfig.kind = NONE;
142 |     preconfig.cuts_defined = 0;
143 |     preconfig.cuts = NULL;
144 | 
145 |     bool one_at_a_time =  false;
146 |     char c;
147 |     while ((c = getopt (argc, argv, "s:k:d:K:D:eh")) != -1) {
148 |         switch (c) {
149 |             case 'e':
150 |                 one_at_a_time = true;
151 |                 break;
152 |             case 's': 
153 |                 config.separator = optarg[0];
154 |                 break;
155 |             case 'k':
156 |             case 'd':
157 |             case 'K':
158 |             case 'D':
159 |                 if (!one_at_a_time) {
160 |                     if (preconfig.kind != NONE) {
161 |                         fprintf(stderr, "Error, you can only pass one kind of cut option.\n");
162 |                         exit(1);
163 |                     }
164 |                     preconfig.cuts = malloc(sizeof(char*));
165 |                     preconfig.cuts_defined = 1;
166 |                     preconfig.cuts[0] = strtok(optarg, ",");
167 |                     char* next_column;
168 |                     while ((next_column = strtok(NULL, ",")) != NULL) {
169 |                         preconfig.cuts_defined++;
170 |                         preconfig.cuts = realloc(preconfig.cuts, sizeof(char*) * preconfig.cuts_defined);
171 |                         preconfig.cuts[preconfig.cuts_defined - 1] = next_column;
172 |                     }
173 |                 }
174 |                 else {
175 |                     if (!preconfig.cuts) {
176 |                         preconfig.cuts = malloc(sizeof(char*));
177 |                         preconfig.cuts_defined = 1;
178 |                     }
179 |                     else {
180 |                         preconfig.cuts_defined++;
181 |                         preconfig.cuts = realloc(preconfig.cuts, sizeof(char*) * preconfig.cuts_defined);
182 |                     }
183 |                     preconfig.cuts[preconfig.cuts_defined - 1] = optarg;
184 |                 }
185 |                 if (c == 'k') {
186 |                     if (preconfig.kind != NONE && preconfig.kind != KEEP_NAMES) {
187 |                         fprintf(stderr, "You can only choose one mode of dropping/keeping columns\n");
188 |                         print_help();
189 |                         exit(1);
190 |                     }
191 |                     preconfig.kind = KEEP_NAMES;
192 |                 }
193 |                 else if (c == 'd') {
194 |                     if (preconfig.kind != NONE && preconfig.kind != DROP_NAMES) {
195 |                         fprintf(stderr, "You can only choose one mode of dropping/keeping columns\n");
196 |                         print_help();
197 |                         exit(1);
198 |                     }
199 |                     preconfig.kind = DROP_NAMES;
200 |                 }
201 |                 else if (c == 'K') {
202 |                     if (preconfig.kind != NONE && preconfig.kind != KEEP_INDEXES) {
203 |                         fprintf(stderr, "You can only choose one mode of dropping/keeping columns\n");
204 |                         print_help();
205 |                         exit(1);
206 |                     }
207 |                     preconfig.kind = KEEP_INDEXES;
208 |                 }
209 |                 else if (c == 'D') {
210 |                     if (preconfig.kind != NONE && preconfig.kind != DROP_INDEXES) {
211 |                         fprintf(stderr, "You can only choose one mode of dropping/keeping columns\n");
212 |                         print_help();
213 |                         exit(1);
214 |                     }
215 |                     preconfig.kind = DROP_INDEXES;
216 |                 }
217 |                 break;
218 |             case '?':
219 |             case 'h':
220 |                 print_help();
221 |                 exit(1);
222 |                 break;
223 |         }
224 |     }
225 | 
226 |     if (preconfig.kind == NONE) {
227 |         fprintf(stderr, "You should describe how you want to cut the csv\n");
228 |         print_help();
229 |         exit(1);
230 |     }
231 | 
232 |     if (optind < argc) {
233 |         config.source = fopen(argv[optind], "r");
234 |         if (!config.source) {
235 |             fprintf(stderr, "Could not open file %s for reading\n", argv[optind]);
236 |             exit(1);
237 |         }
238 |     }
239 | 
240 |     LOG_D("%s\n","Done parsing config params");    
241 | 
242 |     _tokenizer = setup_tokenizer(config.separator, _buffer, _cells,CELL_BUFFER_SIZE);
243 | }
244 | 
// scratch space for unquote(); overwritten by every call
static char _unquote_buffer[BUFFER_SIZE];

/*
 * Copies `*length` bytes of the inside of a quoted cell into the scratch
 * buffer while collapsing escaped quotes: for every '"' encountered the
 * byte after it is copied instead and *length is decreased by one.
 *
 * Returns the scratch buffer; the result is NOT NUL-terminated, its size is
 * the updated *length.
 */
static char const * unquote(char const* restrict quoted, size_t* restrict length) {
    size_t const input_length = *length;
    size_t written = 0;

    for (size_t read = 0; read < input_length; read++) {
        if (quoted[read] == '"') {
            // must be an escaped ", keep only the second one
            read++;
            (*length)--;
        }
        _unquote_buffer[written++] = quoted[read];
    }
    return _unquote_buffer;
}
260 | 
/*
 * Reports whether `needle` (needle_size bytes, not NUL-terminated) equals
 * one of the `amount` NUL-terminated `strings`, ignoring case.
 *
 * A needle that starts with '"' is treated as a quoted cell: the outer
 * quotes are removed and escaped quotes collapsed before comparing.
 */
bool str_contains_n(size_t amount, const char** strings, const char* needle, size_t needle_size) {
    // a quoted cell has at least an opening and a closing quote; checking
    // the size first avoids reading an empty needle and keeps the
    // subtraction below from wrapping around
    if (needle_size >= 2 && *needle == '"') {
        needle++;
        needle_size -= 2;
        needle = unquote(needle, &needle_size);
    }
    for (size_t i = 0; i < amount; i++) {
        if (strlen(strings[i]) == needle_size && strncasecmp(strings[i], needle, needle_size) == 0) {
            return true;
        }
    }
    return false;
}
274 | 
275 | static void finish_config(size_t cells_found) {
276 |     debug_cells(cells_found);
277 | 
278 |     Cell const* current_cell = _cells;
279 |     while (current_cell < (_cells + cells_found) && current_cell->start != NULL) {
280 |         current_cell++;
281 |     }
282 |     config.column_count = (int)(current_cell - _cells);
283 | 
284 |     const char* new_line = _cells[config.column_count-1].start + _cells[config.column_count - 1].length;
285 |     config.newline[0] = new_line[0];
286 |     config.newline_length = 1;
287 |     if (new_line[1] == '\n' || new_line[0] == '\r') {
288 |         config.newline[1] = '\n';
289 |         config.newline_length = 2;
290 |     }
291 | 
292 |     config.keep = calloc(sizeof(bool), config.column_count);
293 |     for (int c = 0; c < config.column_count; c++) {
294 |         config.keep[c] =  false;
295 |     }
296 | 
297 |     if (preconfig.kind == KEEP_NAMES || preconfig.kind == DROP_NAMES) {
298 |         for (int c = 0; c < config.column_count; c++) {
299 |             bool cond = str_contains_n(preconfig.cuts_defined, preconfig.cuts, _cells[c].start, _cells[c].length);
300 |             if ((cond && (preconfig.kind == KEEP_NAMES)) || (!cond && (preconfig.kind == DROP_NAMES))) {
301 |                 config.keep[c] = true;
302 |             }
303 |         }
304 |     }
305 |     else if (preconfig.kind == KEEP_INDEXES || preconfig.kind == DROP_INDEXES) {
306 |         for (int c = 0; c < config.column_count; c++) {
307 |             char str_index[15];
308 |             int str_length = sprintf(str_index, "%d", c);
309 |             bool cond = str_contains_n(preconfig.cuts_defined, preconfig.cuts, str_index, str_length);
310 |             if ((cond && (preconfig.kind == KEEP_INDEXES)) || (!cond && (preconfig.kind == DROP_INDEXES))) {
311 |                 config.keep[c] = true;
312 |             }
313 |         }
314 |     }
315 |     else {
316 |         assert(false);
317 |     }
318 |     free(preconfig.cuts);
319 |     for (int c = 0; c < config.column_count; c++) {
320 |         if (config.keep[c]) {
321 |             config.first_cell = c;
322 |             break;
323 |         }
324 |     }
325 | }
326 | 
327 | static bool _half_printed = false;
328 | static int _current_cell_id = 0;
329 | 
330 | static void output_cells(size_t cells_found, bool last_full) {
331 |     LOG_D("Starting output: %zu (%d)\n", cells_found, last_full);
332 |     LOG_V("Entry: current_cell: %d\n", _current_cell_id);
333 |     Cell const * restrict current_cell = _cells;
334 |     Cell const * restrict cell_end = _cells + cells_found;
335 | 
336 |     char const * restrict current_chunk_start = current_cell->start;
337 |     size_t current_chunk_length = 0;
338 |     int current_chunk_start_id = _current_cell_id;
339 | 
340 |     while (current_cell < cell_end) {
341 |         //LOG_D("Current cell: %d %p\n", _current_cell_id,current_cell->start);
342 |         if (current_cell->start == NULL) {
343 |             if (_current_cell_id < config.column_count) {
344 |                 fprintf(stderr, "Not enough cells in this row, expect: %d, got: %d (cell %zu)\n", config.column_count, _current_cell_id,  (size_t)(current_cell - _cells));
345 |                 exit(1);
346 |                 return;
347 |             }
348 |             if (current_chunk_start != NULL) {
349 |                 current_chunk_length--; // take away newline 
350 |                 if (current_chunk_start != _buffer || !_half_printed) {
351 |                     if (current_chunk_start_id != config.first_cell) {
352 |                         fwrite(&(config.separator),sizeof(char),1, stdout);
353 |                     }
354 |                 }
355 |                 fwrite(current_chunk_start, sizeof(char), current_chunk_length, stdout);
356 |             }
357 |             fwrite(config.newline, sizeof(char), config.newline_length, stdout);
358 |             current_chunk_start = (current_cell + 1)->start;
359 |             current_chunk_length = 0;
360 |             current_chunk_start_id = 0;
361 |             _current_cell_id = -1;
362 |         }
363 |         if (_current_cell_id >= config.column_count) {
364 |             fprintf(stderr, "Too many cells in this row, expect: %d, got: %d (cell: %zu)\n", config.column_count, _current_cell_id, (size_t)(current_cell - _cells));
365 |             exit(1);
366 |             return;
367 |         }
368 |         else if (config.keep[_current_cell_id]) {
369 |             current_chunk_length += 1 + current_cell->length;
370 |         }
371 |         else {
372 |             // a column to drop, so lets write the previous chunk
373 |             if (_current_cell_id >= config.first_cell && current_chunk_length > 0) {
374 |                 current_chunk_length--; // take away last seperator
375 |                 if (current_chunk_start != _buffer || !_half_printed) {
376 |                     if (current_chunk_start_id != config.first_cell) {
377 |                         fwrite(&(config.separator),sizeof(char),1, stdout);
378 |                     }
379 |                 }
380 |                 fwrite(current_chunk_start, sizeof(char), current_chunk_length, stdout);
381 |             }
382 |             // begining of the line, nothing happening
383 |             current_chunk_start = (current_cell + 1)->start;
384 |             current_chunk_length = 0;
385 |             current_chunk_start_id = _current_cell_id + 1;
386 |         }
387 | 
388 |         _current_cell_id++;
389 |         current_cell++;
390 |     }
391 |     if (current_chunk_length > 0) {
392 |         current_chunk_length--; // fix of by one error 
393 |         if (current_chunk_start != _buffer || !_half_printed) {
394 |             if (current_chunk_start_id != config.first_cell) {
395 |                 fwrite(&(config.separator),sizeof(char),1, stdout);
396 |             }
397 |         }
398 |         fwrite(current_chunk_start, sizeof(char), current_chunk_length, stdout);
399 |     }
400 |     if (!last_full) {
401 | 
402 |         _half_printed = true;
403 |         _current_cell_id--;
404 |     }
405 |     else {
406 |         _half_printed = false;
407 |     }
408 |     LOG_V("Exit: current_cell: %d\n", _current_cell_id);
409 | }
410 | 


--------------------------------------------------------------------------------
/src/csvgrep.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <unistd.h>
  3 | #include <stdbool.h>
  4 | #include <assert.h>
  5 | #include <string.h>
  6 | #include <pcre.h> 
  7 | #include "csv_tokenizer.h"
  8 | #include "debug.h"
  9 | #include "hints.h"
 10 | #ifndef PCRE_STUDY_JIT_COMPILE
 11 | #define PCRE_STUDY_JIT_COMPILE 0
 12 | #endif
 13 | 
 14 | #ifndef PCRE_CONFIG_JIT
 15 | #define pcre_free_study pcre_free
 16 | #endif
 17 | 
 18 | 
 19 | 
 20 | //#define BUFFER_SIZE 30
 21 | #define CELL_BUFFER_SIZE (BUFFER_SIZE / 2) + 2
 22 | 
 23 | typedef struct { // compiled regular expression attached to one header column
 24 |     pcre const* restrict pattern; // result of pcre_compile; stays NULL for columns without a -p pattern
 25 |     pcre_extra const* restrict extra; // result of pcre_study; may be NULL
 26 | } Regex;
 27 | 
 28 | struct csv_tokenizer* _tokenizer; // created by setup_tokenizer in parse_config, released in main
 29 | static char _buffer[BUFFER_SIZE + BUFFER_TOKENIZER_POSTFIX]; // input block filled by fread in main
 30 | static Cell _cells[CELL_BUFFER_SIZE]; // cell array handed to setup_tokenizer
 31 | 
 32 | static int _have_jit = 0; // set through pcre_config(PCRE_CONFIG_JIT) in main
 33 | 
 34 | static struct { // run-time settings, filled by parse_config and finish_config
 35 |     FILE* source; // input stream: stdin or the file named on the command line
 36 |     char separator; // cell separator (-s), ',' by default
 37 |     char newline[2]; // line ending copied from the end of the header row
 38 |     size_t newline_length; // 1, or 2 when the header ends in "\r\n"
 39 | 
 40 |     bool count_only; // -c
 41 |     bool negative; // -v
 42 |     bool or; // -o
 43 |     bool case_insensitive; // -i
 44 |     Regex* patterns; // column_count entries, indexed by column position
 45 | 
 46 |     int column_count; // number of cells found in the header row
 47 | } config;
 48 | 
 49 | static long long _count; // rows counted by output_cells when count_only is set
 50 | 
 51 | static char const * unquote(char const* restrict quoted, size_t* restrict length);
 52 | static void parse_config(int argc, char** argv);
 53 | static size_t finish_config(size_t cells_found);
 54 | 
 55 | static void output_cells(size_t cells_found, size_t offset, bool last_full);
 56 | static void debug_cells(size_t total);
 57 | 
 58 | int main(int argc, char** argv) {
 59 | 
 60 |     parse_config(argc, argv);
 61 | #ifdef PCRE_CONFIG_JIT
 62 |     pcre_config(PCRE_CONFIG_JIT, &_have_jit);
 63 | #else
 64 |     _have_jit = false;
 65 | #endif
 66 |     if (!_have_jit) {
 67 |         fprintf(stderr, "I am running without PCRE-JIT support, expect less performance.\n");
 68 |     }
 69 | 
 70 | 
 71 |     size_t chars_read;
 72 |     bool first = true;
 73 |     SEQUENTIAL_HINT(config.source);
 74 |     while ((chars_read = fread(_buffer, 1, BUFFER_SIZE, config.source)) > 0) {
 75 |         LOG_D("New data read: %zu\n", chars_read);
 76 |         prepare_tokenization(_tokenizer, _buffer, chars_read);
 77 |         size_t buffer_consumed = 0;
 78 |         size_t cells_found = 0;
 79 |         bool last_full = true;
 80 | 
 81 | 
 82 |         while (buffer_consumed < chars_read) {
 83 |             tokenize_cells(_tokenizer, buffer_consumed, chars_read, &buffer_consumed, &cells_found, &last_full);
 84 |             LOG_D("Processed: %zu, Cells: %zu\n", buffer_consumed, cells_found);
 85 |             debug_cells(cells_found);
 86 | 
 87 |             size_t cell_offset = 0;
 88 |             if (first) {
 89 |                 first = false;
 90 |                 cell_offset = finish_config(cells_found);
 91 |             }
 92 |             output_cells(cells_found, cell_offset, last_full);
 93 |         }
 94 |     }
 95 |     if (config.count_only) {
 96 |         fprintf(stdout, "%llu\n", _count);
 97 |     }
 98 |     if (_tokenizer != NULL) {
 99 |         free_tokenizer(_tokenizer);
100 |     }
101 |     if (config.patterns != NULL) {
102 |         for (int c = 0; c < config.column_count; c++) {
103 |             if (config.patterns[c].extra) {
104 |                 pcre_free_study((pcre_extra*)(config.patterns[c].extra));
105 |             }
106 |             pcre_free((pcre*)(config.patterns[c].pattern));
107 |         }
108 |     }
109 |     if (config.source != stdin) {
110 |         fclose(config.source);
111 |     }
112 |     return 0;
113 | }
114 | 
/*
 * Debug aid: logs the first `total` entries of _cells through LOG_V.
 * Only active when MOREDEBUG is defined; otherwise it does nothing.
 *
 * A cell whose start is NULL is logged as a newline marker, an empty cell
 * as an empty string, every other cell with its content.
 */
static void debug_cells(size_t total) {
#ifdef MOREDEBUG
    Cell const* current_cell = _cells;
    Cell const* cell_end = _cells + total;

    while (current_cell < cell_end) {
        if (current_cell->start == NULL) {
            LOG_V("Cell %zu : Newline\n", (size_t)(current_cell - _cells));
        }
        else if (current_cell->length == 0) {
            LOG_V("Cell %zu : \n", (size_t)(current_cell - _cells));
        }
        else {
            // cells are not NUL terminated: print with an explicit precision
            // instead of copying into a temporary heap buffer
            LOG_V("Cell %zu : %.*s\n", (size_t)(current_cell - _cells), (int)current_cell->length, current_cell->start);
        }
        current_cell++;
    }
#else
    (void)total;
#endif
}
140 | 
/* Writes the csvgrep usage text to stderr. */
static void print_help(void) {
    // the first two lines need their own newline, otherwise the usage line,
    // "options:" and the first option run together
    fprintf(stderr,"usage: csvgrep [OPTIONS] [FILE]\n");
    fprintf(stderr,"options:\n");
    fprintf(stderr, "-s ,\n");
    fprintf(stderr, "\tWhich character to use as separator (default is ,)\n");
    fprintf(stderr, "-p column/pattern/\n");
    fprintf(stderr, "\tMultiple -p are allowed, they work as an AND \n");
    fprintf(stderr, "-i\n");
    fprintf(stderr, "\tuse case insensitive matching\n");
    fprintf(stderr, "-c\n");
    fprintf(stderr, "\tOnly count the rows that match\n");
    fprintf(stderr, "-o\n");
    fprintf(stderr, "\tMake the match into an OR, changes the behavior of -p and -v\n");
    fprintf(stderr, "-v\n");
    fprintf(stderr, "\tPrint only the rows that did not match all patterns\n");
}
157 | 
158 | static struct {
159 |     size_t n_patterns;
160 |     char ** columns;
161 |     char ** patterns;
162 |     size_t * column_lengths;
163 | } half_config;
164 | 
165 | static void parse_config(int argc, char** argv) {
166 |     config.source = stdin;
167 |     config.separator = ',';
168 |     config.count_only = false;
169 |     config.negative = false;
170 |     config.case_insensitive = false;
171 |     config.or = false;
172 | 
173 |     half_config.n_patterns = 0;
174 |     half_config.columns = malloc(sizeof(char*));
175 |     half_config.patterns = malloc(sizeof(char*));
176 |     half_config.column_lengths = malloc(sizeof(size_t));
177 | 
178 |     char c;
179 |     while ((c = getopt (argc, argv, "s:p:cvio")) != -1) {
180 |         switch (c) {
181 |             case 's': 
182 |                 config.separator = optarg[0];
183 |                 break;
184 |             case 'c': 
185 |                 config.count_only = true;
186 |                 break;
187 |             case 'i':
188 |                 config.case_insensitive = true;
189 |                 break;
190 |             case 'v':
191 |                 config.negative = true;
192 |                 break;
193 |             case 'o':
194 |                 config.or = true;
195 |                 break;
196 |             case 'p':
197 |                 LOG_V("Got pattern: %s\n", optarg);
198 |                 char* column_name = strtok(optarg, "/"); 
199 |                 char* column_pattern = strtok(NULL, "/");
200 |                 for (size_t pat = 0;  pat < half_config.n_patterns; pat++) {
201 |                     if (strcasecmp(column_name, half_config.columns[pat]) == 0) {
202 |                         fprintf(stderr, "You can only define one pattern per column (column: %s)\n", column_name);
203 |                         exit(1);
204 |                     }
205 |                 }
206 |                 half_config.n_patterns++;
207 |                 if (half_config.n_patterns >= 1) {
208 |                     half_config.columns = realloc(half_config.columns, sizeof(char*) * half_config.n_patterns);
209 |                     half_config.patterns = realloc(half_config.patterns, sizeof(char*) * half_config.n_patterns);
210 |                     half_config.column_lengths = realloc(half_config.column_lengths, sizeof(size_t) * half_config.n_patterns);
211 |                 }
212 |                 half_config.columns[half_config.n_patterns - 1] = column_name;
213 |                 half_config.patterns[half_config.n_patterns - 1] = column_pattern;
214 |                 half_config.column_lengths[half_config.n_patterns - 1] = strlen(column_name);
215 |                 break;
216 |             case '?':
217 |             case 'h':
218 |                 print_help();
219 |                 exit(1);
220 |                 break;
221 |         }
222 |     }
223 |     if (optind < argc) {
224 |         config.source = fopen(argv[optind], "r");
225 |         if (!config.source) {
226 |             fprintf(stderr, "Could not open file %s for reading\n", argv[optind]);
227 |             exit(1);
228 |         }
229 |     }
230 | 
231 |     if (half_config.n_patterns == 0) {
232 |         fprintf(stderr, "You should at least provide one pattern\n");
233 |         print_help();
234 |         exit(1);
235 |     }
236 | 
237 |     LOG_D("%s\n","Done parsing config params");    
238 | 
239 |     _tokenizer = setup_tokenizer(config.separator, _buffer, _cells, CELL_BUFFER_SIZE);
240 | 
241 | }
242 | 
243 | static size_t finish_config(size_t cells_found) {
244 | 
245 |     Cell* current_cell = _cells;
246 |     while (current_cell < (_cells + cells_found) && current_cell->start != NULL) {
247 |         if (!config.count_only) {
248 |             // also immediatly print the header
249 |             if (current_cell != _cells) {
250 |                 fwrite(&(config.separator),sizeof(char),1, stdout);
251 |             }
252 |             fwrite(current_cell->start, sizeof(char), current_cell->length, stdout);
253 |         }
254 |         current_cell++;
255 |     }
256 |     config.column_count = (int)(current_cell - _cells);
257 | 
258 |     const char* new_line = _cells[config.column_count-1].start + _cells[config.column_count - 1].length;
259 |     config.newline[0] = new_line[0];
260 |     config.newline_length = 1;
261 |     if (new_line[1] == '\n' && new_line[0] == '\r') {
262 |         config.newline[1] = '\n';
263 |         config.newline_length = 2;
264 |     }
265 |     if (!config.count_only) {
266 |         fwrite(config.newline, sizeof(char), config.newline_length, stdout);
267 |     }
268 | 
269 |     bool* used = calloc(sizeof(bool), half_config.n_patterns);
270 |     memset(used, 0, sizeof(bool) * half_config.n_patterns);
271 |     config.patterns = calloc(sizeof(Regex),config.column_count);
272 |     memset(config.patterns, 0, sizeof(Regex) * config.column_count);
273 |     for (int c = 0; c < config.column_count; c++) {
274 |         const char* column = _cells[c].start;
275 |         size_t length = _cells[c].length;
276 |         if (*column == '"') {
277 |             column++;
278 |             length -= 2;
279 |             column = unquote(column, &length);
280 |         }
281 |         for (size_t pat = 0;  pat < half_config.n_patterns; pat++) {
282 |             if (!used[pat] && length == half_config.column_lengths[pat]) {
283 |                 if (strncasecmp(column, half_config.columns[pat], half_config.column_lengths[pat])==0) {
284 |                     used[pat] = true;
285 |                     LOG_V("Adding pattern %s for column: %s (%d)\n", half_config.patterns[pat], half_config.columns[pat],c);
286 |                     // we have found the column
287 |                     const char *pcreErrorStr;
288 |                     int pcreErrorOffset;
289 |                     config.patterns[c].pattern = pcre_compile(half_config.patterns[pat], PCRE_DOLLAR_ENDONLY |  PCRE_DOTALL | PCRE_NO_UTF8_CHECK | (config.case_insensitive ? PCRE_CASELESS : 0), &pcreErrorStr, &pcreErrorOffset, NULL); 
290 |                     if(config.patterns[c].pattern == NULL) {
291 |                         fprintf(stderr, "ERROR: Could not compile '%s': %s\n", half_config.patterns[pat], pcreErrorStr);
292 |                         exit(1);
293 |                     }
294 |                     config.patterns[c].extra = pcre_study(config.patterns[c].pattern,(_have_jit ? PCRE_STUDY_JIT_COMPILE : 0), &pcreErrorStr);
295 |                     if(config.patterns[c].extra == NULL && pcreErrorStr != NULL) {
296 |                         fprintf(stderr, "ERROR: Could not study '%s': %s\n", half_config.patterns[pat], pcreErrorStr);
297 |                         exit(1);
298 |                     }
299 |                     break;
300 |                 }
301 |             }
302 |         }
303 |     }
304 | 
305 |     bool stop = false;
306 |     for (size_t pat = 0;  pat < half_config.n_patterns; pat++) {
307 |         if (!used[pat]) {
308 |             fprintf(stderr, "ERROR: The column \"%s\" was not found in the header\n", half_config.columns[pat]);
309 |             stop = true;
310 |         }
311 |     }
312 |     if (stop) {
313 |         fprintf(stderr, "Exiting\n");
314 |         exit(1);
315 |     }
316 | 
317 |     free(used);
318 |     free(half_config.columns);
319 |     free(half_config.patterns);
320 |     free(half_config.column_lengths);
321 | 
322 |     return config.column_count + 1 ;
323 | }
324 | 
325 | 
326 | // data for around the edges
327 | static char _prev_line[BUFFER_SIZE * 2];
328 | static size_t _prev_line_length = 0;
329 | static char _prev_cell[BUFFER_SIZE];
330 | static size_t _prev_cell_length = 0;
331 | 
332 | // state of the output
333 | static int _current_cell_id = 0;
334 | static bool _half_line = false;
335 | static bool _half_cell = false;
336 | static bool _prev_matches = true;
337 | 
338 | static void output_cells(size_t cells_found, size_t offset, bool last_full) {
339 |     LOG_D("Starting output: %zu (%d)\n", cells_found, last_full);
340 |     LOG_V("Entry: current_cell: %d\n", _current_cell_id);
341 | 
342 |     Cell const* restrict current_cell = _cells + offset;
343 |     Cell const* restrict cells_end = _cells + cells_found;
344 | 
345 |     bool matches = !config.or;
346 |     if (_half_line) {
347 |         matches = _prev_matches;
348 |     }
349 |     char const* restrict current_line_start = current_cell->start;
350 |     size_t current_line_length = 0;
351 | 
352 |     while (current_cell < cells_end) {
353 |         if (_current_cell_id > config.column_count) {
354 |             fprintf(stderr, "Too many cells in this row, expect: %d, got: %d (cell: %zu)\n", config.column_count, _current_cell_id, (size_t)(current_cell - _cells));
355 |             exit(1);
356 |             return;
357 |         }
358 |         if (current_cell->start == NULL) {
359 |             if (_current_cell_id == config.column_count) {
360 |                 // end of the line
361 |                 if (matches) {
362 |                     if (config.count_only) {
363 |                         _half_line = false;
364 |                         _prev_line_length = 0;
365 |                         _count++;
366 |                     }
367 |                     else {
368 |                         if (_half_line) {
369 |                             LOG_V("Printed previous half line %zu\n", _prev_line_length);
370 |                             fwrite(_prev_line, sizeof(char), _prev_line_length, stdout);
371 |                             _half_line = false;
372 |                             _prev_line_length = 0;
373 |                         }
374 |                         fwrite(current_line_start, sizeof(char), current_line_length, stdout);
375 |                         fwrite(config.newline, sizeof(char), config.newline_length, stdout);
376 |                     }
377 |                 }
378 |                 else if (_half_line) {
379 |                     // we stored the previos part of this line, but it can be dropped
380 |                     _half_line = false;  
381 |                     _prev_line_length = 0;
382 |                 }
383 |                 current_line_start = (current_cell + 1)->start;
384 |                 current_line_length = 0;
385 |                 _current_cell_id = -1;
386 |                 matches = !config.or;
387 |             }
388 |             else if (_current_cell_id < config.column_count) {
389 |                 fprintf(stderr, "Not enough cells in this row, expect: %d, got: %d (cell %zu)\n", config.column_count, _current_cell_id,  (size_t)(current_cell - _cells));
390 |                 exit(1);
391 |                 return;
392 |             }
393 |         }
394 |         else if (matches || config.or) { // only if we have a match does it make sense to test other cells
395 |             current_line_length += 1 + current_cell->length;
396 |             if (_current_cell_id == 0 || current_cell == (_cells + offset)) {
397 |                 current_line_length--; // the first doesn't have a separator
398 |             }
399 |             if (config.patterns[_current_cell_id].pattern != NULL) {
400 |                 char const* restrict cell = current_cell->start;
401 |                 size_t length = current_cell->length;
402 |                 if (current_cell == (cells_end-1) && !last_full) {
403 |                     // we do not have the full cell at the moment, let's copy it
404 |                     size_t old_cell_length = _prev_cell_length;
405 |                     _prev_cell_length += current_cell->length;
406 |                     memcpy(_prev_cell + old_cell_length, current_cell->start, sizeof(char) * current_cell->length);
407 |                     _half_cell = true;
408 |                     _current_cell_id++;
409 |                     break;
410 |                 }
411 |                 if (_half_cell && current_cell == _cells) {
412 |                     // append the current cell to the back of the previous one.
413 |                     assert(_prev_cell_length + length < BUFFER_SIZE);
414 |                     memcpy(_prev_cell + _prev_cell_length, cell, sizeof(char) * length);
415 |                     cell = _prev_cell;
416 |                     length +=  _prev_cell_length;
417 |                     _prev_cell_length = 0;
418 |                 }
419 |                 if (length > 1 && cell[0] == '"') {
420 |                     cell++;
421 |                     length -= 2;
422 |                     char const* restrict c = cell-1;
423 |                     char const* restrict cell_end = cell + length;
424 |                     while (++c < cell_end && *c != '"');
425 |                     if (c != cell_end) {
426 |                         // we have nested quotes
427 |                         cell = unquote(cell, &length);
428 |                     }
429 |                 }
430 |                 int ovector[255];
431 |                 int matchResult = pcre_exec(config.patterns[_current_cell_id].pattern, config.patterns[_current_cell_id].extra, cell, length, 0, 0, ovector, 255);
432 |                 if (config.or) {
433 |                     matches |= (matchResult >= 0) ^ config.negative;
434 |                 }
435 |                 else {
436 |                     matches &= (matchResult >= 0) ^ config.negative;
437 |                 }
438 | #ifdef MOREDEBUG
439 |                 if (matchResult < 0) {
440 |                     fprintf(stderr, "tried to match :'");
441 |                     fwrite(cell, sizeof(char), length, stderr);
442 |                     fprintf(stderr, "'\n");
443 |                     switch(matchResult) {
444 |                         case PCRE_ERROR_NOMATCH      : fprintf(stderr,"String did not match the pattern\n");        break;
445 |                         case PCRE_ERROR_NULL         : fprintf(stderr,"Something was null\n");                      break;
446 |                         case PCRE_ERROR_BADOPTION    : fprintf(stderr,"A bad option was passed\n");                 break;
447 |                         case PCRE_ERROR_BADMAGIC     : fprintf(stderr,"Magic number bad (compiled re corrupt?)\n"); break;
448 |                         case PCRE_ERROR_UNKNOWN_NODE : fprintf(stderr,"Something kooky in the compiled re\n");      break;
449 |                         case PCRE_ERROR_NOMEMORY     : fprintf(stderr,"Ran out of memory\n");                       break;
450 |                         default                      : fprintf(stderr,"Unknown error\n");                           break;
451 |                     }
452 |                 }
453 | #endif
454 |             }
455 |         }
456 | 
457 |         _current_cell_id++;
458 |         current_cell++;
459 |     }
460 |     if (_current_cell_id != 0) {
461 |         // the last row wasn't completly printed, so we must be inside a row
462 |         _prev_matches = matches;
463 |         if (_prev_matches) {
464 |             // it could still match, so let's copy the line
465 |             size_t old_line_length = _prev_line_length;
466 |             _prev_line_length += current_line_length;
467 |             assert(_prev_line_length < (BUFFER_SIZE * 2));
468 |             memcpy(_prev_line + old_line_length, current_line_start, sizeof(char) * current_line_length);
469 |             if (last_full && _current_cell_id != config.column_count) { // the , gets eaten away
470 |                 _prev_line[_prev_line_length++] = config.separator;
471 |             }
472 | #ifdef MOREDEBUG
473 |             fprintf(stderr, "current prev line :'");
474 |             fwrite(_prev_line, sizeof(char), _prev_line_length, stderr);
475 |             fprintf(stderr, "'\n");
476 | #endif
477 |         }
478 |         _half_line = true;
479 |         if (!last_full) {
480 |             _current_cell_id--;
481 |         }
482 |     }
483 |     else {
484 |         _half_line = false;
485 |     }
486 |     LOG_V("Exit: current_cell: %d\n", _current_cell_id);
487 | }
488 | 
489 | 
490 | static char _unquote_buffer[BUFFER_SIZE];
491 | static char const * unquote(char const* restrict quoted, size_t* restrict length) {
492 |     char * restrict result = _unquote_buffer;
493 |     char const * restrict current_char = quoted; 
494 |     char const * restrict char_end = quoted + *length; 
495 |     while (current_char < char_end) {
496 |         if (*current_char == '"') {
497 |             // must be an escaped "
498 |             current_char++;
499 |             (*length)--;
500 |         }
501 |         *result++ = *current_char++;
502 |     }
503 |     return _unquote_buffer;
504 | }
505 | 


--------------------------------------------------------------------------------
/src/csvpipe.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <unistd.h>
  4 | #include <stdbool.h>
  5 | #include <string.h>
  6 | #include <assert.h>
  7 | #include "debug.h"
  8 | #include "hints.h"
  9 | 
 10 | 
 11 | #define NULL_ENCODED '\x1a' // byte that replace_zeroes writes in place of every NUL found in the input
 12 | 
 13 | //#define BUFFER_SIZE 3
 14 | static char _buffer[BUFFER_SIZE]; // input block: filled by fread in main, rewritten in place by do_pipe
 15 | 
 16 | struct { // command line settings, filled by parse_config
 17 |     FILE* source; // input stream: stdin unless a file argument is given
 18 |     bool drop_header; // -d: do_pipe skips everything up to the first line break
 19 | } config;
 20 | 
 21 | static void parse_config(int argc, char** argv);
 22 | static void do_pipe(size_t chars_read);
 23 | 
 24 | int main(int argc, char** argv) {
 25 |     parse_config(argc, argv);
 26 | 
 27 |     size_t chars_read;
 28 |     SEQUENTIAL_HINT(config.source);
 29 |     while ((chars_read = fread(_buffer, sizeof(char), BUFFER_SIZE, config.source)) > 0) {
 30 |         do_pipe(chars_read);
 31 |     }
 32 |     return 0;
 33 | }
 34 | 
 35 | static void print_help() {
 36 |     fprintf(stderr, "usage: csvpipe [OPTIONS] [FILE]");
 37 |     fprintf(stderr, "options:");
 38 |     fprintf(stderr, "-d\n");
 39 |     fprintf(stderr, "  drop header row\n");
 40 | }
 41 | 
 42 | static void parse_config(int argc, char** argv) {
 43 |     config.source = stdin;
 44 |     config.drop_header = false;
 45 |     char c;
 46 |     while ((c = getopt (argc, argv, "d")) != -1) {
 47 |         switch (c) {
 48 |             case 'd':
 49 |                 config.drop_header = true;
 50 |                 break;
 51 |             case '?':
 52 |             case 'h':
 53 |             default:
 54 |                 print_help();
 55 |                 exit(1);
 56 |                 break;
 57 |         }
 58 |     }
 59 |     if (optind < argc) {
 60 |         config.source = fopen(argv[optind], "r");
 61 |         if (!config.source) {
 62 |             fprintf(stderr, "Could not open file %s for reading\n", argv[optind]);
 63 |             exit(1);
 64 |         }
 65 |     }
 66 | }
 67 | 
 68 | enum tokenizer_state { // scanner state that do_pipe keeps between two input blocks
 69 |     FRESH, // nothing pending
 70 |     PREV_NEWLINE, // a '\r' was just consumed; a directly following '\n' is skipped
 71 |     PREV_QUOTE, // the block ended on a '"' inside a quoted cell; the next byte decides whether it was escaped
 72 |     IN_QUOTE, // the block ended inside a quoted cell
 73 | };
 74 | 
 75 | void replace_zeroes(char* restrict current_char, char const* restrict char_end) {
 76 |     while (current_char != NULL) {
 77 |         current_char = memchr(current_char, '\0', char_end - current_char);
 78 |         if (current_char != NULL) {
 79 |             *current_char = NULL_ENCODED;
 80 |         }
 81 |     }
 82 | }
 83 | 
 84 | 
 85 | static bool first_run = true;
 86 | static enum tokenizer_state _state = FRESH;
 87 | 
 88 | static void do_pipe(size_t chars_read) {
 89 |     char* restrict current_char = _buffer;
 90 |     char const* restrict char_end = _buffer + chars_read;
 91 |     char const* restrict current_start = _buffer;
 92 |     LOG_V("Piping: %zu state: %d first char: %c\n", chars_read, _state, *current_char);
 93 | 
 94 |     if (config.drop_header && first_run) {
 95 |         while (current_char < char_end) {
 96 |             if (*current_char == '\n' || *current_char == '\r') {
 97 |                 if (*current_char == '\r') {
 98 |                     _state = PREV_NEWLINE; // handle the windows newlines correctly
 99 |                 }
100 |                 current_start = ++current_char;
101 |                 first_run = false;
102 |                 break;
103 |             }
104 |             current_char++;
105 |         }
106 |         if (current_char == char_end) {
107 |             return;
108 |         }
109 |     }
110 |     // doing this separatly greatly improves the speed of the loop below
111 |     replace_zeroes(current_char, char_end);
112 | 
113 |     switch(_state) {
114 |         case PREV_QUOTE:
115 |             _state = FRESH; // reset state
116 |             if (*current_char == '"') {
117 |                 // we have two quotes
118 |                 // one in the previous block, one in the current
119 |                 goto IN_QUOTE;
120 |             }
121 |             // we were at the end of the quoted cell, so let's continue
122 |             break;
123 |         case IN_QUOTE:
124 |             current_char--; // the loop starts with a increment
125 |             goto IN_QUOTE;
126 |         case PREV_NEWLINE:
127 |             if (*current_char == '\n') {
128 |                 // we already had a newline, so lets eat this second windows
129 |                 // newline
130 |                 current_char++;
131 |                 current_start++;
132 |             }
133 |             _state = FRESH;
134 |             break;
135 |         default:
136 |             break;
137 |     }
138 | 
139 |     while (current_char < char_end) {
140 |         if (*current_char == '"') {
141 | IN_QUOTE:
142 |             while (++current_char < char_end) {
143 |                 if (*current_char == '"') {
144 |                     char const* peek = current_char + 1;
145 |                     if (peek == char_end) {
146 |                         current_char++;
147 |                         _state = PREV_QUOTE;
148 |                         // at the end of stream and not sure if escaped or not
149 |                         break;
150 |                     }
151 |                     else if (*peek == '"') {
152 |                         current_char++;
153 |                         continue;
154 |                     }
155 |                     else {
156 |                         break;
157 |                     }
158 |                 }
159 |             }
160 |             if (current_char == char_end) {
161 |                 // we are at the end, let's write everything we've seen
162 |                 if (_state != PREV_QUOTE) {
163 |                     _state = IN_QUOTE;
164 |                 }
165 |                 break;
166 |             }
167 |             else {
168 |                 current_char++;
169 |                 _state = FRESH;
170 |             }
171 |         }
172 |         else if (*current_char == '\n') {
173 |             *current_char = '\0';
174 |             current_char++;
175 |         }
176 |         else if (*current_char == '\r') {
177 |             *current_char = '\0';
178 |             current_char++;
179 |             if (current_char == char_end) {
180 |                 _state = PREV_NEWLINE;
181 |                 break;
182 |             }
183 |             else if (*current_char == '\n') {
184 |                 // we have windows new lines, so lets skip over this byte
185 |                 fwrite(current_start, sizeof(char), current_char - current_start, stdout);
186 |                 current_char++;
187 |                 current_start = current_char;
188 |             }
189 |         }
190 |         else {
191 |             // all other chars, just skip one
192 |             current_char++;
193 |         }
194 |     }
195 |     if (current_start < char_end) {
196 |         fwrite(current_start, sizeof(char), char_end - current_start, stdout);
197 |     }
198 | }
199 | 
200 | 


--------------------------------------------------------------------------------
/src/csvunpipe.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <unistd.h>
 4 | #include <stdbool.h>
 5 | #include <string.h>
 6 | #include <assert.h>
 7 | #include <string.h>
 8 | #include "debug.h"
 9 | #include "hints.h"
10 | 
11 | 
12 | #define NULL_ENCODED '\x1a' // byte that do_unpipe searches for in every block; NOTE(review): presumably the stand-in for NUL written by csvpipe, confirm
13 | 
14 | //#define BUFFER_SIZE 3
15 | 
16 | static char _buffer[BUFFER_SIZE]; // input block: filled by fread in main, processed by do_unpipe
17 | 
18 | static FILE* _source; // input stream: stdin unless a file argument is given
19 | 
20 | static void parse_config(int argc, char** argv);
21 | static void do_unpipe(size_t chars_read);
22 | 
23 | int main(int argc, char** argv) {
24 |     parse_config(argc, argv);
25 | 
26 |     size_t chars_read;
27 |     SEQUENTIAL_HINT(_source);
28 |     while ((chars_read = fread(_buffer, sizeof(char), BUFFER_SIZE, _source)) > 0) {
29 |         do_unpipe(chars_read);
30 |     }
31 |     if (_source != stdin) {
32 |         fclose(_source);
33 |     }
34 |     return 0;
35 | }
36 | 
// Print the usage summary to stderr, one item per line.
// The "usage:" and "options:" lines are newline-terminated so they no longer
// run together with the option description that follows.
static void print_help(void) {
    fprintf(stderr, "usage: csvunpipe [OPTIONS] [FILE]\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "-p header,row,to,print\n");
    fprintf(stderr, "  Header row to print first\n");
}
43 | 
44 | static void parse_config(int argc, char** argv) {
45 |     _source = stdin;
46 |     char c;
47 |     while ((c = getopt (argc, argv, "p:")) != -1) {
48 |         switch (c) {
49 |             case 'p':
50 |                 fwrite(optarg, sizeof(char), strlen(optarg), stdout);
51 |                 fwrite("\n", sizeof(char), 1, stdout);
52 |                 break;
53 |             case '?':
54 |             case 'h':
55 |             default:
56 |                 print_help();
57 |                 exit(1);
58 |                 break;
59 |         }
60 |     }
61 |     if (optind < argc) {
62 |         _source = fopen(argv[optind], "r");
63 |         if (!_source) {
64 |             fprintf(stderr, "Could not open file %s for reading\n", argv[optind]);
65 |             exit(1);
66 |         }
67 |     }
68 | }
69 | 
70 | static void do_unpipe(size_t chars_read) {
71 |     char* restrict current_char = _buffer;
72 |     char const* restrict char_end = _buffer + chars_read;
73 | 
74 |     while (current_char != NULL) {
75 |         current_char = memchr(current_char, '\0', char_end - current_char);
76 |         if (current_char != NULL) {
77 |             *current_char = '\n';
78 |         }
79 |     }
80 |     current_char = _buffer;
81 |     while (current_char != NULL) {
82 |         current_char = memchr(current_char, NULL_ENCODED, char_end - current_char);
83 |         if (current_char != NULL) {
84 |             *current_char = '\0';
85 |         }
86 |     }
87 |     fwrite(_buffer, sizeof(char), chars_read, stdout);
88 | }
89 | 
90 | 


--------------------------------------------------------------------------------
/src/debug.h:
--------------------------------------------------------------------------------
 1 | #define debug_print(fmt, ...) do { fprintf(stderr, fmt, __VA_ARGS__); } while (0)
 2 | #ifdef DEBUG
 3 | 	#define LOG_D(fmt, ...) debug_print(" D: "fmt, __VA_ARGS__)
 4 | #else
 5 | 	#define LOG_D(fmt, ...) 
 6 | #endif
 7 | 
 8 | #ifdef MOREDEBUG
 9 | 	#define LOG_V(fmt, ...) debug_print(" V: "fmt, __VA_ARGS__)
10 | #else
11 | 	#define LOG_V(fmt, ...) 
12 | #endif
13 | 


--------------------------------------------------------------------------------
/src/hints.h:
--------------------------------------------------------------------------------
 1 | #ifndef HINTS_H
 2 | #define HINTS_H
 3 | #if _XOPEN_SOURCE >= 600 || _POSIX_C_SOURCE >= 200112L
 4 |     #include <fcntl.h>
 5 |     #define SEQUENTIAL_HINT(fd) if (posix_fadvise(fileno(fd), 0, 0, POSIX_FADV_SEQUENTIAL | POSIX_FADV_NOREUSE)) { ; }
 6 | #else
 7 |     #define SEQUENTIAL_HINT(fd) 
 8 | #endif
 9 | 
10 | #endif
11 | 


--------------------------------------------------------------------------------
/test/csv_tokenizer_counts.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <unistd.h>
 3 | #include <stdbool.h>
 4 | #include <assert.h>
 5 | #include <string.h>
 6 | #include "../src/csv_tokenizer.h"
 7 | #include "../src/debug.h"
 8 | 
 9 | #define CELL_BUFFER_SIZE (BUFFER_SIZE / 2) + 2 + 1
10 | struct csv_tokenizer* _tokenizer;
11 | static char _buffer[BUFFER_SIZE + BUFFER_TOKENIZER_POSTFIX];
12 | static Cell _cells[CELL_BUFFER_SIZE];
13 | 
14 | int main(int argc, char** argv) {
15 |     (void)argv;
16 |     if (argc > 1) {
17 |         fprintf(stderr, "This tool is for testing only, pipe a csv into it\n");
18 |         return 0;
19 |     }
20 |     size_t chars_read;
21 |     unsigned long long cell_total = 0;
22 |     _tokenizer = setup_tokenizer(',', _buffer, _cells, CELL_BUFFER_SIZE);
23 |     while ((chars_read = fread(_buffer, 1, BUFFER_SIZE, stdin)) > 0) {
24 |         LOG_D("New data read: %zu\n", chars_read);
25 |         prepare_tokenization(_tokenizer, _buffer, chars_read);
26 |         size_t buffer_consumed = 0;
27 |         size_t cells_found = 0;
28 |         bool last_full = true;
29 | 
30 |         while (buffer_consumed < chars_read) {
31 |             tokenize_cells(_tokenizer, buffer_consumed, chars_read, &buffer_consumed, &cells_found, &last_full);
32 |             LOG_D("Processed: %zu, Cells: %zu\n", buffer_consumed, cells_found);
33 |             cell_total += cells_found;
34 |             if (!last_full) {
35 |                 cell_total--;
36 |             }
37 |         }
38 |     }
39 |     fprintf(stdout, "%llu cells\n", cell_total);
40 |     return 0;
41 | }
42 | 


--------------------------------------------------------------------------------
/test/csvawk/corners_command:
--------------------------------------------------------------------------------
1 | ARGS=( 'BEGIN {ORS="\x1E";} { print ; }')
2 | 


--------------------------------------------------------------------------------
/test/csvawk/corners_input.csv:
--------------------------------------------------------------------------------
1 | ../data/corners.csv


--------------------------------------------------------------------------------
/test/csvawk/corners_output.csv:
--------------------------------------------------------------------------------
1 | abcd""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""


--------------------------------------------------------------------------------
/test/csvawk/large_command:
--------------------------------------------------------------------------------
1 | ARGS=( 'BEGIN {ORS="\x1E";} { print ; }')
2 | 


--------------------------------------------------------------------------------
/test/csvawk/large_input.csv:
--------------------------------------------------------------------------------
1 | ../data/large-fields.csv


--------------------------------------------------------------------------------
/test/csvawk/large_output.csv:
--------------------------------------------------------------------------------
1 | column1column2column3foo 2 bar"foo
2 | 
3 | " 3 bar"foo ""
4 | "5bar
5 | 


--------------------------------------------------------------------------------
/test/csvawk/simple_command:
--------------------------------------------------------------------------------
1 | ARGS=( 'BEGIN {ORS="\x1E";} { print ; }')
2 | 


--------------------------------------------------------------------------------
/test/csvawk/simple_input.csv:
--------------------------------------------------------------------------------
1 | ../data/simple.csv


--------------------------------------------------------------------------------
/test/csvawk/simple_output.csv:
--------------------------------------------------------------------------------
1 | abcde12345234563456745678


--------------------------------------------------------------------------------
/test/csvcut/canada_keep_note_command:
--------------------------------------------------------------------------------
1 | ARGS=(-d Note)
2 | 


--------------------------------------------------------------------------------
/test/csvcut/canada_keep_note_input.csv.xz:
--------------------------------------------------------------------------------
1 | ../data/canada-2011-census.csv.xz


--------------------------------------------------------------------------------
/test/csvcut/canada_keep_note_output.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvcut/canada_keep_note_output.csv.xz


--------------------------------------------------------------------------------
/test/csvcut/column_quoted1_command:
--------------------------------------------------------------------------------
1 | ARGS=(-e -k "a a" -k "c,")
2 | 


--------------------------------------------------------------------------------
/test/csvcut/column_quoted1_input.csv:
--------------------------------------------------------------------------------
1 | ../data/quoted_columns.csv


--------------------------------------------------------------------------------
/test/csvcut/column_quoted1_output.csv:
--------------------------------------------------------------------------------
1 | "a a","c,"
2 | 1,3
3 | 2,4
4 | 3,5
5 | 4,6
6 | 


--------------------------------------------------------------------------------
/test/csvcut/corners_drop_ab_command:
--------------------------------------------------------------------------------
1 | ARGS=(-d a,b)
2 | 


--------------------------------------------------------------------------------
/test/csvcut/corners_drop_ab_input.csv:
--------------------------------------------------------------------------------
1 | ../data/corners.csv


--------------------------------------------------------------------------------
/test/csvcut/corners_drop_ab_output.csv:
--------------------------------------------------------------------------------
1 | c,d
2 | ,
3 | ,
4 | ,
5 | ,
6 | ,""""""""""""""
7 | 


--------------------------------------------------------------------------------
/test/csvcut/corners_keep_ab_command:
--------------------------------------------------------------------------------
1 | ARGS=(-k a,b)
2 | 


--------------------------------------------------------------------------------
/test/csvcut/corners_keep_ab_input.csv:
--------------------------------------------------------------------------------
1 | ../data/corners.csv


--------------------------------------------------------------------------------
/test/csvcut/corners_keep_ab_output.csv:
--------------------------------------------------------------------------------
1 | a,b
2 | ,
3 | ,
4 | ,
5 | """""""""""""",""""""""""""
6 | """""""""""""",""""""""""""
7 | 


--------------------------------------------------------------------------------
/test/csvcut/large_keep_12_command:
--------------------------------------------------------------------------------
1 | ARGS=(-K 1,2)
2 | 


--------------------------------------------------------------------------------
/test/csvcut/large_keep_12_input.csv:
--------------------------------------------------------------------------------
1 | ../data/large-fields.csv


--------------------------------------------------------------------------------
/test/csvcut/large_keep_12_output.csv:
--------------------------------------------------------------------------------
1 | large_keep_col23_output.csv


--------------------------------------------------------------------------------
/test/csvcut/large_keep_col1_command:
--------------------------------------------------------------------------------
1 | ARGS=(-k column1)
2 | 


--------------------------------------------------------------------------------
/test/csvcut/large_keep_col1_input.csv:
--------------------------------------------------------------------------------
1 | ../data/large-fields.csv


--------------------------------------------------------------------------------
/test/csvcut/large_keep_col1_output.csv:
--------------------------------------------------------------------------------
1 | column1
2 | foo
3 | "foo
4 | 
5 | "
6 | "foo ""
7 | "
8 | 


--------------------------------------------------------------------------------
/test/csvcut/large_keep_col23_command:
--------------------------------------------------------------------------------
1 | ARGS=(-k column2,column3)
2 | 


--------------------------------------------------------------------------------
/test/csvcut/large_keep_col23_input.csv:
--------------------------------------------------------------------------------
1 | ../data/large-fields.csv


--------------------------------------------------------------------------------
/test/csvcut/large_keep_col23_output.csv:
--------------------------------------------------------------------------------
1 | column2,column3
2 |  2, bar
3 |  3, bar
4 | 5,bar
5 | 


--------------------------------------------------------------------------------
/test/csvcut/overlapping_column_names2_command:
--------------------------------------------------------------------------------
1 | ARGS=(-d aaa)
2 | 


--------------------------------------------------------------------------------
/test/csvcut/overlapping_column_names2_input.csv:
--------------------------------------------------------------------------------
1 | ../data/simple_overlapping_columns.csv


--------------------------------------------------------------------------------
/test/csvcut/overlapping_column_names2_output.csv:
--------------------------------------------------------------------------------
1 | a,aa,aaaa,b
2 | 1,2,4,5
3 | 2,3,5,6
4 | 3,4,6,7
5 | 4,5,7,8
6 | 


--------------------------------------------------------------------------------
/test/csvcut/overlapping_column_names_command:
--------------------------------------------------------------------------------
1 | ARGS=(-d a)
2 | 


--------------------------------------------------------------------------------
/test/csvcut/overlapping_column_names_input.csv:
--------------------------------------------------------------------------------
1 | ../data/simple_overlapping_columns.csv


--------------------------------------------------------------------------------
/test/csvcut/overlapping_column_names_output.csv:
--------------------------------------------------------------------------------
1 | aa,aaa,aaaa,b
2 | 2,3,4,5
3 | 3,4,5,6
4 | 4,5,6,7
5 | 5,6,7,8
6 | 


--------------------------------------------------------------------------------
/test/csvcut/simple_drop_a_command:
--------------------------------------------------------------------------------
1 | ARGS=(-d a)
2 | 


--------------------------------------------------------------------------------
/test/csvcut/simple_drop_a_input.csv:
--------------------------------------------------------------------------------
1 | ../data/simple.csv


--------------------------------------------------------------------------------
/test/csvcut/simple_drop_a_output.csv:
--------------------------------------------------------------------------------
1 | b,c,d,e
2 | 2,3,4,5
3 | 3,4,5,6
4 | 4,5,6,7
5 | 5,6,7,8
6 | 


--------------------------------------------------------------------------------
/test/csvcut/simple_drop_ab_command:
--------------------------------------------------------------------------------
1 | ARGS=(-d a,b)
2 | 


--------------------------------------------------------------------------------
/test/csvcut/simple_drop_ab_input.csv:
--------------------------------------------------------------------------------
1 | ../data/simple.csv


--------------------------------------------------------------------------------
/test/csvcut/simple_drop_ab_output.csv:
--------------------------------------------------------------------------------
1 | c,d,e
2 | 3,4,5
3 | 4,5,6
4 | 5,6,7
5 | 6,7,8
6 | 


--------------------------------------------------------------------------------
/test/csvcut/simple_keep_ab_command:
--------------------------------------------------------------------------------
1 | ARGS=(-k a,b)
2 | 


--------------------------------------------------------------------------------
/test/csvcut/simple_keep_ab_input.csv:
--------------------------------------------------------------------------------
1 | ../data/simple.csv


--------------------------------------------------------------------------------
/test/csvcut/simple_keep_ab_output.csv:
--------------------------------------------------------------------------------
1 | a,b
2 | 1,2
3 | 2,3
4 | 3,4
5 | 4,5
6 | 


--------------------------------------------------------------------------------
/test/csvcut/simple_keep_ae_command:
--------------------------------------------------------------------------------
1 | ARGS=(-k a,e)
2 | 


--------------------------------------------------------------------------------
/test/csvcut/simple_keep_ae_input.csv:
--------------------------------------------------------------------------------
1 | ../data/simple.csv


--------------------------------------------------------------------------------
/test/csvcut/simple_keep_ae_output.csv:
--------------------------------------------------------------------------------
1 | a,e
2 | 1,5
3 | 2,6
4 | 3,7
5 | 4,8
6 | 


--------------------------------------------------------------------------------
/test/csvgrep/char_range_command:
--------------------------------------------------------------------------------
1 | ARGS=(-p Topic/[A-Z][a-e]/)
2 | 


--------------------------------------------------------------------------------
/test/csvgrep/char_range_input.csv.xz:
--------------------------------------------------------------------------------
1 | ../data/canada-2011-census.csv.xz


--------------------------------------------------------------------------------
/test/csvgrep/char_range_output.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvgrep/char_range_output.csv.xz


--------------------------------------------------------------------------------
/test/csvgrep/empty_cell_command:
--------------------------------------------------------------------------------
1 | ARGS=(-p a/^$/)
2 | 


--------------------------------------------------------------------------------
/test/csvgrep/empty_cell_input.csv:
--------------------------------------------------------------------------------
1 | ../data/corners.csv


--------------------------------------------------------------------------------
/test/csvgrep/empty_cell_output.csv:
--------------------------------------------------------------------------------
1 | a,b,c,d
2 | ,,,
3 | ,,,
4 | ,,,
5 | 


--------------------------------------------------------------------------------
/test/csvgrep/integer_range_command:
--------------------------------------------------------------------------------
1 | ARGS=(-p Characteristic/201[0-2]/)
2 | 


--------------------------------------------------------------------------------
/test/csvgrep/integer_range_input.csv.xz:
--------------------------------------------------------------------------------
1 | ../data/canada-2011-census.csv.xz


--------------------------------------------------------------------------------
/test/csvgrep/integer_range_output.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvgrep/integer_range_output.csv.xz


--------------------------------------------------------------------------------
/test/csvgrep/not_option-text_command:
--------------------------------------------------------------------------------
1 | ARGS=(-v -p "column3/(foo|bar)/")
2 | 


--------------------------------------------------------------------------------
/test/csvgrep/not_option-text_input.csv:
--------------------------------------------------------------------------------
1 | ../data/large-fields.csv


--------------------------------------------------------------------------------
/test/csvgrep/not_option-text_output.csv:
--------------------------------------------------------------------------------
1 | column1,column2,column3
2 | 


--------------------------------------------------------------------------------
/test/csvgrep/not_quoted_cell_command:
--------------------------------------------------------------------------------
1 | ARGS=(-v -p a/[\"]/)
2 | 


--------------------------------------------------------------------------------
/test/csvgrep/not_quoted_cell_input.csv:
--------------------------------------------------------------------------------
1 | ../data/corners.csv


--------------------------------------------------------------------------------
/test/csvgrep/not_quoted_cell_output.csv:
--------------------------------------------------------------------------------
1 | a,b,c,d
2 | ,,,
3 | ,,,
4 | ,,,
5 | 


--------------------------------------------------------------------------------
/test/csvgrep/one_field_command:
--------------------------------------------------------------------------------
1 | ARGS=(-p a/[0-9]+/)
2 | 


--------------------------------------------------------------------------------
/test/csvgrep/one_field_input.csv:
--------------------------------------------------------------------------------
1 | ../data/simple.csv


--------------------------------------------------------------------------------
/test/csvgrep/one_field_output.csv:
--------------------------------------------------------------------------------
1 | a,b,c,d,e
2 | 1,2,3,4,5
3 | 2,3,4,5,6
4 | 3,4,5,6,7
5 | 4,5,6,7,8
6 | 


--------------------------------------------------------------------------------
/test/csvgrep/option-text_command:
--------------------------------------------------------------------------------
1 | ARGS=(-p 'column1/(foo|bar)/')
2 | 


--------------------------------------------------------------------------------
/test/csvgrep/option-text_input.csv:
--------------------------------------------------------------------------------
1 | ../data/large-fields.csv


--------------------------------------------------------------------------------
/test/csvgrep/option-text_output.csv:
--------------------------------------------------------------------------------
1 | column1,column2,column3
2 | foo, 2, bar
3 | "foo
4 | 
5 | ", 3, bar
6 | "foo ""
7 | ",5,bar
8 | 


--------------------------------------------------------------------------------
/test/csvgrep/option2-text_command:
--------------------------------------------------------------------------------
1 | ARGS=(-p 'column3/(foo|bar)/')
2 | 


--------------------------------------------------------------------------------
/test/csvgrep/option2-text_input.csv:
--------------------------------------------------------------------------------
1 | ../data/large-fields.csv


--------------------------------------------------------------------------------
/test/csvgrep/option2-text_output.csv:
--------------------------------------------------------------------------------
1 | column1,column2,column3
2 | foo, 2, bar
3 | "foo
4 | 
5 | ", 3, bar
6 | "foo ""
7 | ",5,bar
8 | 


--------------------------------------------------------------------------------
/test/csvgrep/overlapping_columns1_command:
--------------------------------------------------------------------------------
1 | ARGS=(-p aa/[1-2]/)
2 | 


--------------------------------------------------------------------------------
/test/csvgrep/overlapping_columns1_input.csv:
--------------------------------------------------------------------------------
1 | ../data/simple_overlapping_columns.csv


--------------------------------------------------------------------------------
/test/csvgrep/overlapping_columns1_output.csv:
--------------------------------------------------------------------------------
1 | a,aa,aaa,aaaa,b
2 | 1,2,3,4,5
3 | 


--------------------------------------------------------------------------------
/test/csvgrep/overlapping_columns2_command:
--------------------------------------------------------------------------------
1 | ARGS=(-p aaa/[3-4]/)
2 | 


--------------------------------------------------------------------------------
/test/csvgrep/overlapping_columns2_input.csv:
--------------------------------------------------------------------------------
1 | ../data/simple_overlapping_columns.csv


--------------------------------------------------------------------------------
/test/csvgrep/overlapping_columns2_output.csv:
--------------------------------------------------------------------------------
1 | a,aa,aaa,aaaa,b
2 | 1,2,3,4,5
3 | 2,3,4,5,6
4 | 


--------------------------------------------------------------------------------
/test/csvgrep/quoted_cell_command:
--------------------------------------------------------------------------------
1 | ARGS=(-p a/[\"]/)
2 | 


--------------------------------------------------------------------------------
/test/csvgrep/quoted_cell_input.csv:
--------------------------------------------------------------------------------
1 | ../data/corners.csv


--------------------------------------------------------------------------------
/test/csvgrep/quoted_cell_output.csv:
--------------------------------------------------------------------------------
1 | a,b,c,d
2 | """""""""""""","""""""""""",,
3 | """""""""""""","""""""""""",,""""""""""""""
4 | 


--------------------------------------------------------------------------------
/test/csvgrep/two_NOT_field_command:
--------------------------------------------------------------------------------
1 | ARGS=(-v -p a/[1-2]/ -p b/[2-3]/)
2 | 


--------------------------------------------------------------------------------
/test/csvgrep/two_NOT_field_input.csv:
--------------------------------------------------------------------------------
1 | ../data/simple.csv


--------------------------------------------------------------------------------
/test/csvgrep/two_NOT_field_output.csv:
--------------------------------------------------------------------------------
1 | a,b,c,d,e
2 | 3,4,5,6,7
3 | 4,5,6,7,8
4 | 


--------------------------------------------------------------------------------
/test/csvgrep/two_field_command:
--------------------------------------------------------------------------------
1 | ARGS=(-p a/[1-2]/ -p b/[2-3]/)
2 | 


--------------------------------------------------------------------------------
/test/csvgrep/two_field_input.csv:
--------------------------------------------------------------------------------
1 | ../data/simple.csv


--------------------------------------------------------------------------------
/test/csvgrep/two_field_output.csv:
--------------------------------------------------------------------------------
1 | a,b,c,d,e
2 | 1,2,3,4,5
3 | 2,3,4,5,6
4 | 


--------------------------------------------------------------------------------
/test/csvpipe/canada_command:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvpipe/canada_command


--------------------------------------------------------------------------------
/test/csvpipe/canada_input.csv.xz:
--------------------------------------------------------------------------------
1 | ../data/canada-2011-census.csv.xz


--------------------------------------------------------------------------------
/test/csvpipe/canada_output.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvpipe/canada_output.csv.xz


--------------------------------------------------------------------------------
/test/csvpipe/corners_command:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvpipe/corners_command


--------------------------------------------------------------------------------
/test/csvpipe/corners_input.csv:
--------------------------------------------------------------------------------
1 | ../data/corners.csv


--------------------------------------------------------------------------------
/test/csvpipe/corners_output.csv:
--------------------------------------------------------------------------------
1 | a,b,c,d ,,, ,,, ,,, """""""""""""","""""""""""",, """""""""""""","""""""""""",,"""""""""""""" 


--------------------------------------------------------------------------------
/test/csvpipe/drop_header_command:
--------------------------------------------------------------------------------
1 | ARGS=(-d)
2 | 


--------------------------------------------------------------------------------
/test/csvpipe/drop_header_input.csv:
--------------------------------------------------------------------------------
1 | ../data/simple.csv


--------------------------------------------------------------------------------
/test/csvpipe/drop_header_output.csv:
--------------------------------------------------------------------------------
1 | 1,2,3,4,5 2,3,4,5,6 3,4,5,6,7 4,5,6,7,8 
2 | 


--------------------------------------------------------------------------------
/test/csvpipe/large-fields_command:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvpipe/large-fields_command


--------------------------------------------------------------------------------
/test/csvpipe/large-fields_input.csv:
--------------------------------------------------------------------------------
1 | ../data/large-fields.csv


--------------------------------------------------------------------------------
/test/csvpipe/large-fields_output.csv:
--------------------------------------------------------------------------------
1 | foo, 2, bar "foo
2 | 
3 | ", 3, bar "foo ""
4 | ",5,bar 
5 | 


--------------------------------------------------------------------------------
/test/csvpipe/simple_command:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvpipe/simple_command


--------------------------------------------------------------------------------
/test/csvpipe/simple_input.csv:
--------------------------------------------------------------------------------
1 | ../data/simple.csv


--------------------------------------------------------------------------------
/test/csvpipe/simple_output.csv:
--------------------------------------------------------------------------------
1 | 1,2,3,4,5 2,3,4,5,6 3,4,5,6,7 4,5,6,7,8 
2 | 


--------------------------------------------------------------------------------
/test/csvtokenizercounts/canada_command:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvtokenizercounts/canada_command


--------------------------------------------------------------------------------
/test/csvtokenizercounts/canada_input.csv.xz:
--------------------------------------------------------------------------------
1 | ../data/canada-2011-census.csv.xz


--------------------------------------------------------------------------------
/test/csvtokenizercounts/canada_output.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvtokenizercounts/canada_output.csv.xz


--------------------------------------------------------------------------------
/test/csvtokenizercounts/corners_command:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvtokenizercounts/corners_command


--------------------------------------------------------------------------------
/test/csvtokenizercounts/corners_input.csv:
--------------------------------------------------------------------------------
1 | ../data/corners.csv


--------------------------------------------------------------------------------
/test/csvtokenizercounts/corners_output.csv:
--------------------------------------------------------------------------------
1 | 30 cells
2 | 


--------------------------------------------------------------------------------
/test/csvtokenizercounts/large-fields_command:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvtokenizercounts/large-fields_command


--------------------------------------------------------------------------------
/test/csvtokenizercounts/large-fields_input.csv:
--------------------------------------------------------------------------------
1 | ../data/large-fields.csv


--------------------------------------------------------------------------------
/test/csvtokenizercounts/large-fields_output.csv:
--------------------------------------------------------------------------------
1 | 16 cells
2 | 


--------------------------------------------------------------------------------
/test/csvtokenizercounts/quoted_columns_command:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvtokenizercounts/quoted_columns_command


--------------------------------------------------------------------------------
/test/csvtokenizercounts/quoted_columns_input.csv:
--------------------------------------------------------------------------------
1 | ../data/quoted_columns.csv


--------------------------------------------------------------------------------
/test/csvtokenizercounts/quoted_columns_output.csv:
--------------------------------------------------------------------------------
1 | 30 cells
2 | 


--------------------------------------------------------------------------------
/test/csvtokenizercounts/simple_command:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvtokenizercounts/simple_command


--------------------------------------------------------------------------------
/test/csvtokenizercounts/simple_input.csv:
--------------------------------------------------------------------------------
1 | ../data/simple.csv


--------------------------------------------------------------------------------
/test/csvtokenizercounts/simple_output.csv:
--------------------------------------------------------------------------------
1 | 30 cells
2 | 


--------------------------------------------------------------------------------
/test/csvtokenizercounts/simple_overlapping_columns_command:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvtokenizercounts/simple_overlapping_columns_command


--------------------------------------------------------------------------------
/test/csvtokenizercounts/simple_overlapping_columns_input.csv:
--------------------------------------------------------------------------------
1 | ../data/simple_overlapping_columns.csv


--------------------------------------------------------------------------------
/test/csvtokenizercounts/simple_overlapping_columns_output.csv:
--------------------------------------------------------------------------------
1 | 30 cells
2 | 


--------------------------------------------------------------------------------
/test/csvunpipe/canada_command:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvunpipe/canada_command


--------------------------------------------------------------------------------
/test/csvunpipe/canada_input.csv.xz:
--------------------------------------------------------------------------------
1 | ../csvpipe/canada_output.csv.xz


--------------------------------------------------------------------------------
/test/csvunpipe/canada_output.csv.xz:
--------------------------------------------------------------------------------
1 | ../csvpipe/canada_input.csv.xz


--------------------------------------------------------------------------------
/test/csvunpipe/corners_command:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/csvunpipe/corners_command


--------------------------------------------------------------------------------
/test/csvunpipe/corners_input.csv:
--------------------------------------------------------------------------------
1 | ../csvpipe/corners_output.csv


--------------------------------------------------------------------------------
/test/csvunpipe/corners_output.csv:
--------------------------------------------------------------------------------
1 | ../csvpipe/corners_input.csv


--------------------------------------------------------------------------------
/test/csvunpipe/drop_header_command:
--------------------------------------------------------------------------------
1 | ARGS=(-p a,b,c,d,e)
2 | 


--------------------------------------------------------------------------------
/test/csvunpipe/drop_header_input.csv:
--------------------------------------------------------------------------------
1 | ../csvpipe/drop_header_output.csv


--------------------------------------------------------------------------------
/test/csvunpipe/drop_header_output.csv:
--------------------------------------------------------------------------------
1 | ../csvpipe/drop_header_input.csv


--------------------------------------------------------------------------------
/test/csvunpipe/large-fields_command:
--------------------------------------------------------------------------------
1 | ARGS=(-p column1,column2,column3)
2 | 


--------------------------------------------------------------------------------
/test/csvunpipe/large-fields_input.csv:
--------------------------------------------------------------------------------
1 | ../csvpipe/large-fields_output.csv


--------------------------------------------------------------------------------
/test/csvunpipe/large-fields_output.csv:
--------------------------------------------------------------------------------
1 | ../csvpipe/large-fields_input.csv


--------------------------------------------------------------------------------
/test/csvunpipe/simple_command:
--------------------------------------------------------------------------------
1 | ARGS=(-p a,b,c,d,e)
2 | 


--------------------------------------------------------------------------------
/test/csvunpipe/simple_input.csv:
--------------------------------------------------------------------------------
1 | ../csvpipe/simple_output.csv


--------------------------------------------------------------------------------
/test/csvunpipe/simple_output.csv:
--------------------------------------------------------------------------------
1 | ../csvpipe/simple_input.csv


--------------------------------------------------------------------------------
/test/data/canada-2011-census.csv.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DavyLandman/csvtools/0162d828ec7500cf01080f73fd28387a9cdada92/test/data/canada-2011-census.csv.xz


--------------------------------------------------------------------------------
/test/data/corners.csv:
--------------------------------------------------------------------------------
1 | a,b,c,d
2 | ,,,
3 | ,,,
4 | ,,,
5 | """""""""""""","""""""""""",,
6 | """""""""""""","""""""""""",,""""""""""""""
7 | 


--------------------------------------------------------------------------------
/test/data/large-fields.csv:
--------------------------------------------------------------------------------
1 | column1,column2,column3
2 | foo, 2, bar
3 | "foo
4 | 
5 | ", 3, bar
6 | "foo ""
7 | ",5,bar
8 | 


--------------------------------------------------------------------------------
/test/data/quoted_columns.csv:
--------------------------------------------------------------------------------
1 | "a a","b","c,","  d","e"
2 | 1,2,3,4,5
3 | 2,3,4,5,6
4 | 3,4,5,6,7
5 | 4,5,6,7,8
6 | 


--------------------------------------------------------------------------------
/test/data/simple.csv:
--------------------------------------------------------------------------------
1 | a,b,c,d,e
2 | 1,2,3,4,5
3 | 2,3,4,5,6
4 | 3,4,5,6,7
5 | 4,5,6,7,8
6 | 


--------------------------------------------------------------------------------
/test/data/simple_overlapping_columns.csv:
--------------------------------------------------------------------------------
1 | a,aa,aaa,aaaa,b
2 | 1,2,3,4,5
3 | 2,3,4,5,6
4 | 3,4,5,6,7
5 | 4,5,6,7,8
6 | 


--------------------------------------------------------------------------------
/test/runtest.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | PROGRAM=$1  # tool under test: binary ../bin/$PROGRAM, fixtures globbed from $PROGRAM/*_input.csv*
 3 | LARGE_FILES=$2  # the .csv.xz fixtures are only run when this equals 1
 4 | RESULT=0  # exit status of this script; set to 1 by every failing check
 5 | 
 6 | 
 7 | test_normal() {
 8 |     # Run ../bin/$PROGRAM with ARGS on $INPUT and compare its stdout with the
 9 |     # reference file named by $OUTPUT. Sets RESULT=1 on a crash or a mismatch.
10 |     REF_FILE=$OUTPUT
11 |     REF=$(cat "$REF_FILE")
12 |     OUTPUT=$("../bin/$PROGRAM" "${ARGS[@]}" < "$INPUT")
13 |     if (($? > 0)); then
14 |         printf "\t- %s params: \"%s\" = \t Failed (%s crashed)\n" "$INPUT" "${ARGS[*]}" "$PROGRAM"
15 |         RESULT=1
16 |         return
17 |     fi
18 | 
19 |     if [ "$OUTPUT" != "$REF" ]; then
20 |         printf "\t- %s params: \"%s\" = \t Failed\n" "$INPUT" "${ARGS[*]}"
21 |         # Pass the output as data ('%s'), never as the printf format: CSV may
22 |         # contain '%' or '\'. $(...) stripped trailing newlines, so add one back.
23 |         printf '%s\n' "$OUTPUT" | diff -a -d "$REF_FILE" -
24 |         RESULT=1
25 |     else
26 |         printf "\t- %s params: \"%s\" = \t OK\n" "$INPUT" "${ARGS[*]}"
27 |     fi
28 | }
29 | 
30 | test_xz() {
31 |     # xz variant of test_normal: compares the MD5 digest of the decompressed reference ($OUTPUT) with that of the tool's output.
32 |     REF=$(xzcat "$OUTPUT" | openssl md5)
33 |     # pipefail (local to this subshell) lets $? report a failing xzcat or $PROGRAM, not only `openssl md5`.
34 |     OUTPUT=$(set -o pipefail; xzcat "$INPUT" | "../bin/$PROGRAM" "${ARGS[@]}" | openssl md5)
35 |     if (($? > 0)); then
36 |         printf "\t- %s params: \"%s\" = \t Failed (%s crashed)\n" "$INPUT" "${ARGS[*]}" "$PROGRAM"
37 |         RESULT=1
38 |         return
39 |     fi
40 |     if [ "$OUTPUT" != "$REF" ]; then
41 |         printf "\t- %s params: \"%s\" = \t Failed\n" "$INPUT" "${ARGS[*]}"
42 |         RESULT=1
43 |     else
44 |         printf "\t- %s params: \"%s\" = \t OK\n" "$INPUT" "${ARGS[*]}"
45 |     fi
46 | }
47 | printf "Testing %s\n" "$PROGRAM"
48 | OUTPUT=$("../bin/$PROGRAM" -h 2>&1 | wc -l)  # number of lines the tool prints for -h
49 | if (($? > 0)) || [ $OUTPUT -lt 1 ]; then  # $OUTPUT stays unquoted so padding emitted by wc is dropped
50 |     printf "\t- %s has no help params\n" "$PROGRAM"
51 |     RESULT=1
52 | fi
53 | # Run every fixture <name>_input.csv[.xz] with its <name>_command and <name>_output.csv[.xz] siblings.
54 | for INPUT in "$PROGRAM"/*_input.csv*;
55 | do
56 |     ARGS=()  # start clean: a command file that sets no ARGS must not inherit the previous fixture's
57 |     source "${INPUT%%input.csv*}command"  # cut from the first "input.csv" to the end, append "command"
58 |     OUTPUT=${INPUT/input/output}  # first "input" in the path becomes "output"
59 |     case $INPUT in
60 |         *.csv.xz )
61 |             if ((LARGE_FILES == 1)); then  # without the $ an unset/empty LARGE_FILES evaluates to 0
62 |                 test_xz
63 |             fi
64 |         ;;
65 |         *.csv )
66 |             test_normal
67 |         ;;
68 |     esac
69 | done
70 | if [ "$RESULT" -eq 0 ]; then
71 |     printf "Tests succeeded\n"
72 | else
73 |     printf "Tests failed\n"
74 | fi
75 | exit $RESULT
76 | 


--------------------------------------------------------------------------------
/test/test-sizes.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # run from root dir!
 3 | 
 4 | 
 5 | # Command-line arguments are handed on to the make test invocations in test_with_size.
 6 | # With the default IFS "$*" joins them with single spaces; it is empty when none were given.
 7 | EXTRA_FLAGS="$*"
 8 | 
 9 | # Stop at the first failing command that is not part of a tested condition.
10 | set -e
11 | 
12 | # silent function from https://serverfault.com/questions/607884
13 | SILENT_LOG=/tmp/silent_log_$$.txt
14 | trap "/bin/rm -f $SILENT_LOG" EXIT
15 | 
16 | report_and_exit() {  # print the captured command output, then abort the script with status 1
17 |     cat "$SILENT_LOG"
18 |     exit 1
19 | }
20 | 
21 | silent() {  # run the command given as arguments, capturing stdout+stderr in $SILENT_LOG; on failure dump the log and exit
22 |     rm -f "${SILENT_LOG}"  # start from an empty log (plain call: backticks would execute rm's output)
23 |     $* >> "${SILENT_LOG}" 2>&1 || report_and_exit  # $* is deliberately unquoted: callers pass "make deep-clean" as one word
24 | }
25 | 
26 | test_with_size() {
27 |     # Run the make test targets with BUFFER_SIZE=$1 (plus $EXTRA_FLAGS, split on whitespace).
28 |     # Prints a red failure line and returns 1 as soon as a make invocation fails, else returns 0.
29 |     # Only POSIX constructs ([ -gt ], printf) are used because the shebang is /bin/sh: a shell
30 |     # without the (( )) extension (e.g. dash) parses `(($1 > 30))` as nested subshells.
31 |     if [ "$1" -gt 30 ]; then
32 |         if [ "$1" -gt 72 ]; then # csvcut has to read the full header
33 |             if [ "$1" -gt 145 ]; then # csvgrep has to fit the max line length in 2*BUFFER_SIZE
34 |                 make test-csvcut test-csvgrep BUFFER_SIZE=$1 DISABLE_ASSERTS=-g $EXTRA_FLAGS
35 |             else
36 |                 make test-csvgrep BUFFER_SIZE=$1 DISABLE_ASSERTS=-g SKIP_LARGE_FILES=1 $EXTRA_FLAGS &&
37 |                     make test-csvcut BUFFER_SIZE=$1 DISABLE_ASSERTS=-g $EXTRA_FLAGS
38 |             fi
39 |         else
40 |             make test-csvcut test-csvgrep BUFFER_SIZE=$1 DISABLE_ASSERTS=-g SKIP_LARGE_FILES=1 $EXTRA_FLAGS
41 |         fi
42 |     fi
43 |     if [ $? -gt 0 ]; then # status of the whole if-block above (0 when no branch ran)
44 |         printf '\033[91mFailure with size %s\033[39m\n' "$1"
45 |         return 1
46 |     fi
47 |     make test-csvpipe test-csvunpipe test-csvawk test-tokenizer BUFFER_SIZE=$1 DISABLE_ASSERTS=-g $EXTRA_FLAGS
48 |     if [ $? -gt 0 ]; then
49 |         printf '\033[91mFailure with size %s\033[39m\n' "$1"
50 |         return 1
51 |     fi
52 |     return 0
53 | }
54 | 
55 | echo "Testing predefined sizes"
56 | for s in 1 2 3 4 5 6 7 8 11 16 21 24 32 36 63 128 1024;
57 |     do
58 |         silent "make deep-clean"
59 |         printf 'Testing size: \t %s\n' "$s"  # printf: whether echo expands \t differs between shells
60 |         silent test_with_size $s
61 |     done
62 | 
63 | echo "Trying 40 random sizes"
64 | for x in $(seq 1 40);
65 |     do
66 |         silent "make deep-clean"
67 |         RANDOMNUM=$( head -200 /dev/urandom | cksum | cut -f1 -d " ")  # CRC of random bytes as a random integer
68 |         s=$(( ( RANDOMNUM % 400 )  + 1 ));  # buffer size in 1..400
69 |         printf 'Testing size: \t %s (run %s/40)\n' "$s" "$x"
70 |         silent test_with_size $s
71 |     done
72 | 


--------------------------------------------------------------------------------