├── .gitignore
├── resources
│   ├── run.sh
│   └── run_python.sh
├── Python
│   ├── Dockerfile.python2
│   ├── Dockerfile.python3
│   ├── csv_test.py
│   └── csv_test_3.py
├── .travis.yml
├── Go
│   ├── Dockerfile
│   └── csvtest.go
├── D
│   ├── Dockerfile.ldc
│   ├── Dockerfile.dmd
│   ├── csv_test.d
│   └── fastcsv.d
├── Nim
│   ├── Dockerfile
│   └── csv_test.nim
├── C
│   ├── Dockerfile.gcc
│   ├── Dockerfile.clang
│   └── csv_test.c
├── README.md
└── Makefile

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.DS_Store

data/ngrams.tsv
output/*

--------------------------------------------------------------------------------
/resources/run.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
( time /app/csv_test /data/ngrams.tsv 1 2 ) > "/output/$1" 2>&1

--------------------------------------------------------------------------------
/resources/run_python.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
( time python /app/csv_test.py /data/ngrams.tsv 1 2 ) > "/output/$1" 2>&1

--------------------------------------------------------------------------------
/Python/Dockerfile.python2:
--------------------------------------------------------------------------------
# runtime image

FROM python:2-alpine

WORKDIR /app

COPY csv_test.py .

# JSON form requires double quotes; single quotes are parsed as literal volume names
VOLUME ["/data", "/output", "/resources"]

RUN apk add --no-cache bash

CMD /resources/run_python.sh python2.txt

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
sudo: required

services:
  - docker

script:
  - make run

deploy:
  provider: pages
  skip-cleanup: true
  github-token: $GITHUB_TOKEN
  keep-history: true
  local-dir: output
  target-branch: gh-pages

--------------------------------------------------------------------------------
/Python/Dockerfile.python3:
--------------------------------------------------------------------------------
# runtime image

FROM python:3-alpine

WORKDIR /app

COPY csv_test_3.py ./csv_test.py

VOLUME ["/data", "/output", "/resources"]

RUN apk add --no-cache bash

CMD /resources/run_python.sh python3.txt

--------------------------------------------------------------------------------
/Go/Dockerfile:
--------------------------------------------------------------------------------
# build image

FROM golang:alpine AS build-env

WORKDIR /app

COPY csvtest.go .

RUN go build -o csv_test ./csvtest.go

# runtime image

FROM alpine:latest

WORKDIR /app

COPY --from=build-env /app/csv_test .

VOLUME ["/data", "/output", "/resources"]

RUN apk add --no-cache bash

CMD /resources/run.sh go.txt

--------------------------------------------------------------------------------
/D/Dockerfile.ldc:
--------------------------------------------------------------------------------
# build image

FROM dlanguage/ldc AS build-env

WORKDIR /app

COPY *.d ./

RUN ldc2 -of=./csv_test -O -release -boundscheck=off ./csv_test.d ./fastcsv.d

# runtime image

FROM ubuntu:16.04

WORKDIR /app

COPY --from=build-env /app/csv_test .

VOLUME ["/data", "/output", "/resources"]

CMD /resources/run.sh d_ldc.txt

--------------------------------------------------------------------------------
/D/Dockerfile.dmd:
--------------------------------------------------------------------------------
# build image

FROM dlanguage/dmd AS build-env

WORKDIR /app

COPY *.d ./

RUN dmd -O -release -inline -boundscheck=off -of=./csv_test ./csv_test.d ./fastcsv.d

# runtime image

FROM ubuntu:16.04

WORKDIR /app

COPY --from=build-env /app/csv_test .

VOLUME ["/data", "/output", "/resources"]

CMD /resources/run.sh d_dmd.txt

--------------------------------------------------------------------------------
/Nim/Dockerfile:
--------------------------------------------------------------------------------
# build image

FROM nimlang/nim:latest-alpine AS build-env

WORKDIR /app

COPY csv_test.nim .

RUN nim c -d:release -o:csv_test csv_test.nim

# runtime image

FROM alpine:latest

WORKDIR /app

COPY --from=build-env /app/csv_test .

VOLUME ["/data", "/output", "/resources"]

RUN apk add --no-cache bash

CMD /resources/run.sh nim.txt

--------------------------------------------------------------------------------
/C/Dockerfile.gcc:
--------------------------------------------------------------------------------
# build image

FROM alpine:latest AS build-env

WORKDIR /app

COPY csv_test.c .

RUN apk add --no-cache gcc musl-dev

RUN gcc -O3 -g -Wall -o csv_test csv_test.c

# runtime image

FROM alpine:latest

WORKDIR /app

COPY --from=build-env /app/csv_test .

VOLUME ["/data", "/output", "/resources"]

RUN apk add --no-cache bash

CMD /resources/run.sh c_gcc.txt

--------------------------------------------------------------------------------
/C/Dockerfile.clang:
--------------------------------------------------------------------------------
# build image

FROM alpine:latest AS build-env

WORKDIR /app

COPY csv_test.c .

RUN apk add --no-cache clang musl-dev alpine-sdk

RUN clang -O3 -g -Wall -o csv_test csv_test.c

# runtime image

FROM alpine:latest

WORKDIR /app

COPY --from=build-env /app/csv_test .

VOLUME ["/data", "/output", "/resources"]

RUN apk add --no-cache bash

CMD /resources/run.sh c_clang.txt

--------------------------------------------------------------------------------
/Python/csv_test.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

import argparse
import collections

def main():
    parser = argparse.ArgumentParser(description='Sum a column.')
    parser.add_argument('file', type=open)
    parser.add_argument('key_field_index', type=int)
    parser.add_argument('value_field_index', type=int)

    args = parser.parse_args()
    delim = '\t'

    max_field_index = max(args.key_field_index, args.value_field_index)
    sum_by_key = collections.Counter()

    for line in args.file:
        fields = line.rstrip('\n').split(delim)
        if max_field_index < len(fields):
            sum_by_key[fields[args.key_field_index]] += int(fields[args.value_field_index])

    max_entry = sum_by_key.most_common(1)
    if len(max_entry) == 0:
        print 'No entries'
    else:
        print 'max_key:', max_entry[0][0], 'sum:', max_entry[0][1]

if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/Python/csv_test_3.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import collections

def main():
    parser = argparse.ArgumentParser(description='Sum a column.')
    parser.add_argument('file', type=open)
    parser.add_argument('key_field_index', type=int)
    parser.add_argument('value_field_index', type=int)

    args = parser.parse_args()
    delim = '\t'

    max_field_index = max(args.key_field_index, args.value_field_index)
    sum_by_key = collections.Counter()

    for line in args.file:
        fields = line.rstrip('\n').split(delim)
        if max_field_index < len(fields):
            sum_by_key[fields[args.key_field_index]] += int(fields[args.value_field_index])

    max_entry = sum_by_key.most_common(1)
    if len(max_entry) == 0:
        print('No entries')
    else:
        print('max_key:', max_entry[0][0], 'sum:', max_entry[0][1])

if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/D/csv_test.d:
--------------------------------------------------------------------------------
import std.algorithm, std.conv, std.file, std.stdio;
import fastcsv;

const delim = '\t';

int main(string[] args) {
    if (args.length < 4) {
        writeln("synopsis: ", args[0], " filename keyfield valuefield");
        return 1;
    }

    const filename = args[1],
          keyFieldIndex = args[2].to!size_t,
          valueFieldIndex = args[3].to!size_t,
          maxFieldIndex = max(keyFieldIndex, valueFieldIndex);
    const file = cast(string) read(filename);
    long[string] sumByKey;

    foreach (record; file.csvByRecord!(delim)) {
        if (record.length > maxFieldIndex)
            sumByKey[record[keyFieldIndex]] += record[valueFieldIndex].to!long;
    }

    if (sumByKey.length == 0) {
        writeln("No entries");
    }
    else {
        const maxEntry = sumByKey.byKeyValue.maxElement!"a.value";
        writeln("max_key: ", maxEntry.key, " sum: ", maxEntry.value);
    }

    return 0;
}
--------------------------------------------------------------------------------
/Nim/csv_test.nim:
--------------------------------------------------------------------------------
import os, strutils, streams, tables, parsecsv

const
  Delim = '\t'

proc main() =
  if paramCount() < 3:
    quit("synopsis: " & getAppFilename() & " filename keyfield valuefield")

  let
    filename = paramStr(1)
    keyFieldIndex = parseInt(paramStr(2))
    valueFieldIndex = parseInt(paramStr(3))
    maxFieldIndex = max(keyFieldIndex, valueFieldIndex)

  var
    sumByKey = newCountTable[string]()
    file = newFileStream(filename, fmRead)

  if file == nil:
    quit("cannot open the file " & filename)

  defer: file.close()

  var csv: CsvParser
  open(csv, file, filename, separator=Delim)

  while csv.readRow():
    if len(csv.row) > maxFieldIndex:
      sumByKey.inc(csv.row[keyFieldIndex], parseInt(csv.row[valueFieldIndex]))

  if sumByKey.len() == 0:
    echo "No entries"
  else:
    let largest = sumByKey.largest()
    echo "max_key: ", largest[0], " sum: ", largest[1]

main()

--------------------------------------------------------------------------------
/Go/csvtest.go:
--------------------------------------------------------------------------------
package main

import (
	"bufio"
	"fmt"
	"io"
	"math"
	"os"
	"strconv"
	"strings"
)

func maxEntry(dict map[int]int) (key, value int) {
	mk := 0
	mv := 0

	for k, v := range dict {
		if mv < v {
			mv = v
			mk = k
		}
	}
	return mk, mv
}

func main() {
	sumByKey := make(map[int]int)
	delim := "\t"

	// Need the program name plus three arguments: filename, key field, value field.
	if len(os.Args) < 4 {
		fmt.Println("synopsis: csvtest filename keyfield valuefield")
		os.Exit(1)
	}

	filename := os.Args[1]
	keyFieldIndex, _ := strconv.Atoi(os.Args[2])
	valueFieldIndex, _ := strconv.Atoi(os.Args[3])
	maxFieldIndex := int(math.Max(float64(keyFieldIndex),
		float64(valueFieldIndex)))
	file, err := os.Open(filename)
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}

	reader := bufio.NewReader(file)

	for {
		line, err := reader.ReadString('\n')
		if err == io.EOF {
			break
		}
		record := strings.Split(line, delim)
		if maxFieldIndex < len(record) {
			value, _ := strconv.Atoi(record[valueFieldIndex])
			key, _ := strconv.Atoi(record[keyFieldIndex])
			sumByKey[key] += value
		}
	}

	if len(sumByKey) == 0 {
		fmt.Println("No entries")
	} else {
		maxKey, maxValue := maxEntry(sumByKey)
		fmt.Println("max_key:", maxKey, "sum:", maxValue)
	}
}

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Faster Command Line Tools in Nim

This is a project to test how Nim compares to D in one very specific scenario. It was inspired by the [`Faster Command Line Tools in D` blog post](http://dlang.org/blog/2017/05/24/faster-command-line-tools-in-d/).

There's a full blog post explaining the reasoning, along with some basic results from my system, [available here](https://www.euantorano.co.uk/posts/faster-command-line-tools-in-nim/).

## Running the tests

All versions are built and run using Docker.
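
The computation every implementation performs is the same: sum one tab-separated column grouped by another, then report the key with the largest sum. The `1 2` field indices passed by the run scripts are zero-based, so on the ngrams data the task is roughly equivalent to this awk one-liner (shown for illustration only; it is not part of the benchmark):

```
awk -F'\t' '{ s[$2] += $3 } END { for (k in s) if (s[k] > max) { max = s[k]; key = k } print "max_key:", key, "sum:", max }' data/ngrams.tsv
```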

To run every version and dump results into the `output` folder, you can use the `run` Make target:

```
make run
```

You can also run an individual language's target if you're working on improving it:

- **C**: `make c_run`
- **D**: `make d_run`
- **Go**: `make go_run`
- **Nim**: `make nim_run`
- **Python**: `make python_run`

Any of these targets will download `data/ngrams.tsv` first if it doesn't already exist.

## Results

This repository is built by Travis for every push or PR. Results are published to the `gh-pages` branch:

- [C (GCC)](https://euantorano.github.io/faster-command-line-tools-in-nim/c_gcc.txt)
- [C (clang)](https://euantorano.github.io/faster-command-line-tools-in-nim/c_clang.txt)
- [D (DMD)](https://euantorano.github.io/faster-command-line-tools-in-nim/d_dmd.txt)
- [D (LDC)](https://euantorano.github.io/faster-command-line-tools-in-nim/d_ldc.txt)
- [Go](https://euantorano.github.io/faster-command-line-tools-in-nim/go.txt)
- [Nim](https://euantorano.github.io/faster-command-line-tools-in-nim/nim.txt)
- [Python 2](https://euantorano.github.io/faster-command-line-tools-in-nim/python2.txt)
- [Python 3](https://euantorano.github.io/faster-command-line-tools-in-nim/python3.txt)

## TODO

- [ ] Build an overall results file which will be published to GitHub Pages - possibly include graphs in this file?
- [ ] Run each version multiple times and take the average run times

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
CURRENT_DIR=$(shell pwd)

c_image:
	@docker build -f ./C/Dockerfile.gcc -t csv_test_c_gcc ./C/
	@docker build -f ./C/Dockerfile.clang -t csv_test_c_clang ./C/

d_image:
	@docker build -f ./D/Dockerfile.dmd -t csv_test_d_dmd ./D/
	@docker build -f ./D/Dockerfile.ldc -t csv_test_d_ldc ./D/

go_image:
	@docker build -t csv_test_go ./Go/

nim_image:
	@docker build -t csv_test_nim ./Nim/

python_image:
	@docker build -f ./Python/Dockerfile.python2 -t csv_test_python2 ./Python/
	@docker build -f ./Python/Dockerfile.python3 -t csv_test_python3 ./Python/

build: c_image d_image go_image nim_image python_image

data/ngrams.tsv:
	@mkdir -p data
	@curl --output ./data/ngrams.gz https://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-0.gz
	@gunzip ./data/ngrams.gz
	@mv ./data/ngrams ./data/ngrams.tsv

c_run: c_image data/ngrams.tsv
	@docker run --rm -v $(CURRENT_DIR)/data:/data:ro -v $(CURRENT_DIR)/output:/output -v $(CURRENT_DIR)/resources:/resources:ro csv_test_c_gcc
	@docker run --rm -v $(CURRENT_DIR)/data:/data:ro -v $(CURRENT_DIR)/output:/output -v $(CURRENT_DIR)/resources:/resources:ro csv_test_c_clang

d_run: d_image data/ngrams.tsv
	@docker run --rm -v $(CURRENT_DIR)/data:/data:ro -v $(CURRENT_DIR)/output:/output -v $(CURRENT_DIR)/resources:/resources:ro csv_test_d_dmd
	@docker run --rm -v $(CURRENT_DIR)/data:/data:ro -v $(CURRENT_DIR)/output:/output -v $(CURRENT_DIR)/resources:/resources:ro csv_test_d_ldc

go_run: go_image data/ngrams.tsv
	@docker run --rm -v $(CURRENT_DIR)/data:/data:ro -v $(CURRENT_DIR)/output:/output -v $(CURRENT_DIR)/resources:/resources:ro csv_test_go

nim_run: nim_image data/ngrams.tsv
	@docker run --rm -v $(CURRENT_DIR)/data:/data:ro -v $(CURRENT_DIR)/output:/output -v $(CURRENT_DIR)/resources:/resources:ro csv_test_nim

python_run: python_image data/ngrams.tsv
	@docker run --rm -v $(CURRENT_DIR)/data:/data:ro -v $(CURRENT_DIR)/output:/output -v $(CURRENT_DIR)/resources:/resources:ro csv_test_python2
	@docker run --rm -v $(CURRENT_DIR)/data:/data:ro -v $(CURRENT_DIR)/output:/output -v $(CURRENT_DIR)/resources:/resources:ro csv_test_python3

run: c_run d_run go_run nim_run python_run

clean:
	@rm -Rf data
	@rm -Rf output
	@docker image rm csv_test_c_gcc csv_test_c_clang csv_test_d_dmd csv_test_d_ldc csv_test_go csv_test_nim csv_test_python2 csv_test_python3 2>/dev/null; true

.PHONY: c_image d_image go_image nim_image python_image build c_run d_run go_run nim_run python_run run clean

--------------------------------------------------------------------------------
/C/csv_test.c:
--------------------------------------------------------------------------------
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <fcntl.h>
#include <pthread.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

enum {
	NWorkers = 14,
	NBucket = 2048,
};

typedef struct Bucket Bucket;
typedef struct Worker Worker;

/* One bucket of the hash table: parallel arrays of keys and counts. */
struct Bucket {
	char **key;
	int *count;
	size_t size;
};

/* Each worker thread sums its own [beg, end) slice of the file into a
 * private hash table; the tables are merged after the threads join. */
struct Worker {
	pthread_t thread;
	size_t beg, end;
	Bucket hash[NBucket];
};

Worker w[NWorkers];
char *data;

void
die(char *msg)
{
	fputs(msg, stderr);
	exit(1);
}

int
addcount(Bucket *b, char *key, int count)
{
	size_t n;

	for (n=0; n<b->size; n++)
		if (strcmp(key, b->key[n]) == 0)
			return b->count[n] += count;
	b->size++;
	/* grow the parallel arrays by element size, not raw byte count */
	b->key = realloc(b->key, b->size * sizeof(*b->key));
	b->count = realloc(b->count, b->size * sizeof(*b->count));
	b->key[n] = key;
	return b->count[n] = count;
}

/* Parse this worker's slice. Note the key and value fields (1 and 2, as
 * used by the benchmark) are hardcoded rather than read from argv. */
void *
worker(void *p)
{
	char *key, *cur, *endp;
	size_t beg, end;
	int count;
	uint32_t h;
	Bucket *hash;
	Worker *pw;

	pw = p;
	hash = pw->hash;
	beg = pw->beg;
	end = pw->end;

	cur = &data[beg];
	endp = &data[end];

	do {
		while (*cur != '\t')
			cur++;
		key = ++cur;
		h = 0;
		while (*cur != '\t') {
			h = h*33 + *cur;
			cur++;
		}
		*cur++ = 0;
		count = 0;
		while (*cur != '\t') {
			count = count * 10 + (*cur - '0');
			cur++;
		}
		addcount(&hash[h & (NBucket-1)], key, count);
		cur = memchr(cur, '\n', endp-cur);
	} while (cur && ++cur != endp);

	// printf("Done %zd - %zd\n", beg, end);
	return 0;
}

int
main(int ac, char *av[])
{
	struct stat s;
	int fd, max;
	size_t cur, chunk, fsz, n, i;
	char *nl, *maxk;
	Worker *pw;
	Bucket *b;

	if (ac < 2)
		die("no argument\n");

	fd = open(av[1], O_RDONLY);
	if (fd == -1)
		die("cannot open file\n");
	if (fstat(fd, &s))
		die("cannot stat file\n");
	fsz = s.st_size;
	data = mmap(0, fsz, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (data == MAP_FAILED)
		die("cannot mmap file\n");

	/* start workers, rounding each chunk boundary up to a newline */
	chunk = fsz / NWorkers;
	cur = 0;
	for (pw=w; pw<&w[NWorkers]; pw++) {
		pw->beg = cur;
		cur += chunk;
		if (cur > fsz)
			cur = fsz;
		nl = memchr(&data[cur-1], '\n', fsz-cur+1);
		if (nl)
			cur = nl - data + 1;
		pw->end = cur;
		pthread_create(&pw->thread, 0, worker, pw);
	}
	assert(cur == fsz);

	/* wait for all threads to be done */
	for (pw=w; pw<&w[NWorkers]; pw++)
		pthread_join(pw->thread, 0);

	/* merge every worker's counts into w[0]'s table */
	for (pw=&w[1]; pw<&w[NWorkers]; pw++) {
		for (n=0; n<NBucket; n++) {
			b = &pw->hash[n];
			for (i=0; i<b->size; i++)
				addcount(&w[0].hash[n], b->key[i], b->count[i]);
		}
	}

	/* scan the merged table for the maximum, so that keys seen only
	 * by worker 0 are considered too */
	max = 0;
	maxk = "oops";
	for (n=0; n<NBucket; n++) {
		b = &w[0].hash[n];
		for (i=0; i<b->size; i++) {
			if (b->count[i] > max) {
				max = b->count[i];
				maxk = b->key[i];
			}
		}
	}

	printf("max_key: %s sum: %d\n", maxk, max);
	return 0;
}

--------------------------------------------------------------------------------
/D/fastcsv.d:
--------------------------------------------------------------------------------
/**
 * Experimental fast CSV reader.
 *
 * Based on RFC 4180.
 */
module fastcsv;

/**
 * Reads CSV data from the given filename.
 */
auto csvFromUtf8File(string filename)
{
    import std.file : read;
    return csvToArray(cast(string) read(filename));
}

private char[] filterQuotes(dchar quote)(const(char)[] str) pure
{
    auto buf = new char[str.length];
    size_t j = 0;
    for (size_t i = 0; i < str.length; i++)
    {
        if (str[i] == quote)
        {
            buf[j++] = '"';
            i++;

            if (i >= str.length)
                break;

            if (str[i] == quote)
                continue;
        }
        buf[j++] = str[i];
    }
    return buf[0 .. j];
}

/**
 * Parse CSV data into an input range of records.
 *
 * Params:
 *  fieldDelim = The field delimiter (default: ',')
 *  quote = The quote character (default: '"')
 *  input = The data in CSV format.
 *
 * Returns:
 *  An input range of records, each of which is an array of fields.
 *
 * Bugs:
 * Does not do any validation on the input; will produce nonsensical results
 * if input is malformed.
 *
 * Cannot handle records with more than 65,536 fields each. (This limit can be
 * statically increased by increasing fieldBlockSize.)
 */
auto csvByRecord(dchar fieldDelim=',', dchar quote='"')(const(char)[] input)
{
    struct Result
    {
        private enum fieldBlockSize = 1 << 16;
        private const(char)[] data;
        private const(char)[][] fields;
        private size_t i, curField;

        bool empty = true;
        const(char)[][] front;

        this(const(char)[] input)
        {
            data = input;
            fields = new const(char)[][fieldBlockSize];
            i = 0;
            curField = 0;
            empty = (input.length == 0);
            parseNextRecord();
        }

        void parseNextRecord()
        {
            size_t firstField = curField;
            while (i < data.length && data[i] != '\n' && data[i] != '\r')
            {
                // Parse fields
                size_t firstChar, lastChar;
                bool hasDoubledQuotes = false;

                if (data[i] == quote)
                {
                    import std.algorithm : max;

                    i++;
                    firstChar = i;
                    while (i < data.length)
                    {
                        if (data[i] == quote)
                        {
                            i++;
                            if (i >= data.length || data[i] != quote)
                                break;

                            hasDoubledQuotes = true;
                        }
                        i++;
                    }
                    assert(i-1 < data.length);
                    lastChar = max(firstChar, i-1);
                }
                else
                {
                    firstChar = i;
                    while (i < data.length && data[i] != fieldDelim &&
                           data[i] != '\n' && data[i] != '\r')
                    {
                        i++;
                    }
                    lastChar = i;
                }
                if (curField >= fields.length)
                {
                    // Fields block is full; copy current record fields into
                    // new block so that they are contiguous.
                    auto nextFields = new const(char)[][fieldBlockSize];
                    nextFields[0 .. curField - firstField] =
                        fields[firstField .. curField];

                    //fields.length = firstField; // release unused memory?

                    curField = curField - firstField;
                    firstField = 0;
                    fields = nextFields;
                }
                assert(curField < fields.length);
                if (hasDoubledQuotes)
                    fields[curField++] = filterQuotes!quote(
                        data[firstChar .. lastChar]);
                else
                    fields[curField++] = data[firstChar .. lastChar];

                // Skip over field delimiter
                if (i < data.length && data[i] == fieldDelim)
                    i++;
            }

            front = fields[firstField .. curField];

            // Skip over record delimiter(s)
            while (i < data.length && (data[i] == '\n' || data[i] == '\r'))
                i++;
        }

        void popFront()
        {
            if (i >= data.length)
            {
                empty = true;
                front = [];
            }
            else
                parseNextRecord();
        }
    }
    return Result(input);
}

/**
 * Parses CSV string data into an array of records.
 *
 * Params:
 *  fieldDelim = The field delimiter (default: ',')
 *  quote = The quote character (default: '"')
 *  input = The data in CSV format.
 *
 * Returns:
 *  An array of records, each of which is an array of fields.
 */
auto csvToArray(dchar fieldDelim=',', dchar quote='"')(const(char)[] input)
{
    import core.memory : GC;
    import std.array : array;

    GC.disable();
    auto result = input.csvByRecord!(fieldDelim, quote).array;
    GC.collect();
    GC.enable();
    return result;
}

unittest
{
    auto sampleData =
        `123,abc,"mno pqr",0` ~ "\n" ~
        `456,def,"stuv wx",1` ~ "\n" ~
        `78,ghijk,"yx",2`;

    auto parsed = csvToArray(sampleData);
    assert(parsed == [
        [ "123", "abc", "mno pqr", "0" ],
        [ "456", "def", "stuv wx", "1" ],
        [ "78", "ghijk", "yx", "2" ]
    ]);
}

unittest
{
    auto dosData =
        `123,aa,bb,cc` ~ "\r\n" ~
        `456,dd,ee,ff` ~ "\r\n" ~
        `789,gg,hh,ii` ~ "\r\n";

    auto parsed = csvToArray(dosData);
    assert(parsed == [
        [ "123", "aa", "bb", "cc" ],
        [ "456", "dd", "ee", "ff" ],
        [ "789", "gg", "hh", "ii" ]
    ]);
}

unittest
{
    // Quoted fields that contain newlines and delimiters
    auto nastyData =
        `123,abc,"ha ha ` ~ "\n" ~
        `ha this is a split value",567` ~ "\n" ~
        `321,"a,comma,b",def,111` ~ "\n";

    auto parsed = csvToArray(nastyData);
    assert(parsed == [
        [ "123", "abc", "ha ha \nha this is a split value", "567" ],
        [ "321", "a,comma,b", "def", "111" ]
    ]);
}

unittest
{
    // Quoted fields that contain quotes
    // (Note: RFC-4180 does not allow doubled quotes in unquoted fields)
    auto nastyData =
        `123,"a b ""haha"" c",456` ~ "\n";

    auto parsed = csvToArray(nastyData);
    assert(parsed == [
        [ "123", `a b "haha" c`, "456" ]
    ]);
}

// Boundary condition checks
unittest
{
    auto badData = `123,345,"def""`;
    auto parsed = csvToArray(badData); // should not crash

    auto moreBadData = `123,345,"a"`;
    parsed = csvToArray(moreBadData); // should not crash

    auto yetMoreBadData = `123,345,"`;
    parsed = csvToArray(yetMoreBadData); // should not crash

    auto emptyField = `123,,456`;
    parsed = csvToArray(emptyField);
    assert(parsed == [ [ "123", "", "456" ] ]);
}

static if (__VERSION__ < 2067UL)
{
    // Copied from std.traits, to fill a gap in older versions of Phobos
    import std.typetuple : staticMap, TypeTuple;
    private enum NameOf(alias T) = T.stringof;
    template isNested(T)
        if (is(T == class) || is(T == struct) || is(T == union))
    {
        enum isNested = __traits(isNested, T);
    }
    template FieldNameTuple(T)
    {
        static if (is(T == struct) || is(T == union))
            alias FieldNameTuple = staticMap!(NameOf, T.tupleof[0 .. $ - isNested!T]);
        else static if (is(T == class))
            alias FieldNameTuple = staticMap!(NameOf, T.tupleof);
        else
            alias FieldNameTuple = TypeTuple!"";
    }
}

/**
 * Transcribe CSV data into an array of structs.
 *
 * Params:
 *  S = The type of the struct each record must conform to.
 *  fieldDelim = The field delimiter (default: ',')
 *  quote = The quote character (default: '"')
 *  input = The data in CSV format.
 *
 * Returns:
 *  An array of S.
 *
 * Bugs:
 * Cannot handle strings larger than 64KB each. (This limit can be statically
 * raised by increasing stringBufSize.)
 */
auto csvByStruct(S, dchar fieldDelim=',', dchar quote='"')(const(char)[] input)
    if (is(S == struct))
{
    struct Result
    {
        private enum stringBufSize = 1 << 16;
        private const(char)[] data;
        private char[] stringBuf;
        private size_t i, curStringIdx;

        bool empty = true;
        S front;

        this(const(char)[] input)
        {
            data = input;
            stringBuf = new char[stringBufSize];
            i = 0;
            curStringIdx = 0;

            if (input.length > 0)
            {
                empty = false;
                parseHeader();
                if (input.length > 0)
                    parseNextRecord();
            }
        }

        const(char)[] parseField() pure
        {
            size_t firstChar, lastChar;
            bool hasDoubledQuotes = false;

            if (data[i] == quote)
            {
                import std.algorithm : max;

                i++;
                firstChar = i;
                while (i < data.length)
                {
                    if (data[i] == quote)
                    {
                        i++;
                        if (i >= data.length || data[i] != quote)
                            break;

                        hasDoubledQuotes = true;
                    }
                    i++;
                }
                assert(i-1 < data.length);
                lastChar = max(firstChar, i-1);
            }
            else
            {
                firstChar = i;
                while (i < data.length && data[i] != fieldDelim &&
                       data[i] != '\n' && data[i] != '\r')
                {
                    i++;
                }
                lastChar = i;
            }
            return (hasDoubledQuotes) ?
                filterQuotes!quote(data[firstChar .. lastChar]) :
                data[firstChar .. lastChar];
        }

        void parseHeader()
        {
            static if (__VERSION__ >= 2067UL)
                import std.traits : FieldNameTuple;

            assert(i < data.length);
            foreach (field; FieldNameTuple!S)
            {
                if (parseField() != field)
                    throw new Exception(
                        "CSV fields do not match struct fields");

                // Skip over field delimiter
                if (i < data.length && data[i] == fieldDelim)
                    i++;
            }

            if (i < data.length && data[i] != '\n' && data[i] != '\r')
                throw new Exception("CSV fields do not match struct fields");

            // Skip over record delimiter(s)
            while (i < data.length && (data[i] == '\n' || data[i] == '\r'))
                i++;
        }

        void parseNextRecord()
        {
            import std.conv : to;
            static if (__VERSION__ >= 2067UL)
                import std.traits : FieldNameTuple;

            assert(i < data.length);
            foreach (field; FieldNameTuple!S)
            {
                alias Value = typeof(__traits(getMember, front, field));

                // Convert value
                const(char)[] strval = parseField();
                static if (is(Value == string))
                {
                    // Optimization for string fields: instead of many small
                    // string allocations, consolidate strings into a string
                    // buffer and take slices of it.
                    if (strval.length + curStringIdx >= stringBuf.length)
                    {
                        // String buffer full; allocate new buffer.
                        stringBuf = new char[stringBufSize];
                        curStringIdx = 0;
                    }
                    stringBuf[curStringIdx .. curStringIdx + strval.length] =
                        strval[0 .. $];

                    // Since we never take overlapping slices of stringBuf,
                    // it's safe to assume uniqueness here.
                    import std.exception : assumeUnique;
                    __traits(getMember, front, field) = assumeUnique(
                        stringBuf[curStringIdx .. curStringIdx + strval.length]);
                    curStringIdx += strval.length;
                }
                else
                    __traits(getMember, front, field) = strval.to!Value;

                // Skip over field delimiter
                if (i < data.length && data[i] == fieldDelim)
                    i++;
            }

            if (i < data.length && data[i] != '\n' && data[i] != '\r')
                throw new Exception("Record does not match struct");

            // Skip over record delimiter(s)
            while (i < data.length && (data[i] == '\n' || data[i] == '\r'))
                i++;
        }

        void popFront()
        {
            if (i >= data.length)
            {
                empty = true;
                front = front.init;
            }
            else
                parseNextRecord();
        }
    }
    return Result(input);
}

unittest
{
    import std.algorithm.comparison : equal;

    struct S
    {
        string name;
        int year;
        int month;
        int day;
    }
    auto input =
        `name,year,month,day` ~"\n"~
        `John Smith,1995,1,1` ~"\n"~
        `Jane Doe,1996,2,14` ~"\n"~
        `Albert Donahue,1997,3,30`;

    auto r = input.csvByStruct!S;
    assert(r.equal([
        S("John Smith", 1995, 1, 1),
        S("Jane Doe", 1996, 2, 14),
        S("Albert Donahue", 1997, 3, 30)
    ]));

    // Test failure cases
    import std.exception : assertThrown;

    struct T
    {
        string name;
        int age;
        int customerId;
    }
    assertThrown(input.csvByStruct!T.front);

    auto badInput =
        `name,year,month,day` ~"\n"~
        `1995,Jane Doe,2,14`;
    assertThrown(badInput.csvByStruct!S.front);
}

version(none)
unittest
{
    auto data = csvFromUtf8File("ext/cbp13co.txt");
    import std.stdio;
    writefln("%d records", data.length);
}

// vim:set ai sw=4 ts=4 et:

--------------------------------------------------------------------------------