├── .gitignore
├── resources
│   ├── run.sh
│   └── run_python.sh
├── Python
│   ├── Dockerfile.python2
│   ├── Dockerfile.python3
│   ├── csv_test.py
│   └── csv_test_3.py
├── .travis.yml
├── Go
│   ├── Dockerfile
│   └── csvtest.go
├── D
│   ├── Dockerfile.ldc
│   ├── Dockerfile.dmd
│   ├── csv_test.d
│   └── fastcsv.d
├── Nim
│   ├── Dockerfile
│   └── csv_test.nim
├── C
│   ├── Dockerfile.gcc
│   ├── Dockerfile.clang
│   └── csv_test.c
├── README.md
└── Makefile

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.DS_Store

data/ngrams.tsv
output/*

--------------------------------------------------------------------------------
/resources/run.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
( time /app/csv_test /data/ngrams.tsv 1 2 ) > "/output/$1" 2>&1

--------------------------------------------------------------------------------
/resources/run_python.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
( time python /app/csv_test.py /data/ngrams.tsv 1 2 ) > "/output/$1" 2>&1

--------------------------------------------------------------------------------
/Python/Dockerfile.python2:
--------------------------------------------------------------------------------
# runtime image

FROM python:2-alpine

WORKDIR /app

COPY csv_test.py .

# JSON form requires double quotes; single quotes are parsed as literal volume names
VOLUME ["/data", "/output", "/resources"]

RUN apk add --no-cache bash

CMD /resources/run_python.sh python2.txt

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
sudo: required

services:
  - docker

script:
  - make run

deploy:
  provider: pages
  skip-cleanup: true
  github-token: $GITHUB_TOKEN
  keep-history: true
  local-dir: output
  target-branch: gh-pages

--------------------------------------------------------------------------------
/Python/Dockerfile.python3:
--------------------------------------------------------------------------------
# runtime image

FROM python:3-alpine

WORKDIR /app

COPY csv_test_3.py ./csv_test.py

VOLUME ["/data", "/output", "/resources"]

RUN apk add --no-cache bash

CMD /resources/run_python.sh python3.txt

--------------------------------------------------------------------------------
/Go/Dockerfile:
--------------------------------------------------------------------------------
# build image

FROM golang:alpine AS build-env

WORKDIR /app

COPY csvtest.go .

RUN go build -o csv_test ./csvtest.go

# runtime image

FROM alpine:latest

WORKDIR /app

COPY --from=build-env /app/csv_test .

VOLUME ["/data", "/output", "/resources"]

RUN apk add --no-cache bash

CMD /resources/run.sh go.txt

--------------------------------------------------------------------------------
/D/Dockerfile.ldc:
--------------------------------------------------------------------------------
# build image

FROM dlanguage/ldc AS build-env

WORKDIR /app

COPY *.d ./

RUN ldc2 -of=./csv_test -O -release -boundscheck=off ./csv_test.d ./fastcsv.d

# runtime image

FROM ubuntu:16.04

WORKDIR /app

COPY --from=build-env /app/csv_test .

VOLUME ["/data", "/output", "/resources"]

CMD /resources/run.sh d_ldc.txt

--------------------------------------------------------------------------------
/D/Dockerfile.dmd:
--------------------------------------------------------------------------------
# build image

FROM dlanguage/dmd AS build-env

WORKDIR /app

COPY *.d ./

RUN dmd -O -release -inline -boundscheck=off -of=./csv_test ./csv_test.d ./fastcsv.d

# runtime image

FROM ubuntu:16.04

WORKDIR /app

COPY --from=build-env /app/csv_test .

VOLUME ["/data", "/output", "/resources"]

CMD /resources/run.sh d_dmd.txt

--------------------------------------------------------------------------------
/Nim/Dockerfile:
--------------------------------------------------------------------------------
# build image

FROM nimlang/nim:latest-alpine AS build-env

WORKDIR /app

COPY csv_test.nim .

RUN nim c -d:release -o:csv_test csv_test.nim

# runtime image

FROM alpine:latest

WORKDIR /app

COPY --from=build-env /app/csv_test .

VOLUME ["/data", "/output", "/resources"]

RUN apk add --no-cache bash

CMD /resources/run.sh nim.txt

--------------------------------------------------------------------------------
/C/Dockerfile.gcc:
--------------------------------------------------------------------------------
# build image

FROM alpine:latest AS build-env

WORKDIR /app

COPY csv_test.c .

RUN apk add --no-cache gcc musl-dev

RUN gcc -O3 -g -Wall -o csv_test csv_test.c

# runtime image

FROM alpine:latest

WORKDIR /app

COPY --from=build-env /app/csv_test .

VOLUME ["/data", "/output", "/resources"]

RUN apk add --no-cache bash

CMD /resources/run.sh c_gcc.txt

--------------------------------------------------------------------------------
/C/Dockerfile.clang:
--------------------------------------------------------------------------------
# build image

FROM alpine:latest AS build-env

WORKDIR /app

COPY csv_test.c .

RUN apk add --no-cache clang musl-dev alpine-sdk

RUN clang -O3 -g -Wall -o csv_test csv_test.c

# runtime image

FROM alpine:latest

WORKDIR /app

COPY --from=build-env /app/csv_test .

VOLUME ["/data", "/output", "/resources"]

RUN apk add --no-cache bash

CMD /resources/run.sh c_clang.txt

--------------------------------------------------------------------------------
/Python/csv_test.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

import argparse
import collections

def main():
    parser = argparse.ArgumentParser(description='Sum a column.')
    parser.add_argument('file', type=open)
    parser.add_argument('key_field_index', type=int)
    parser.add_argument('value_field_index', type=int)

    args = parser.parse_args()
    delim = '\t'

    max_field_index = max(args.key_field_index, args.value_field_index)
    sum_by_key = collections.Counter()

    for line in args.file:
        fields = line.rstrip('\n').split(delim)
        if max_field_index < len(fields):
            sum_by_key[fields[args.key_field_index]] += int(fields[args.value_field_index])

    max_entry = sum_by_key.most_common(1)
    if len(max_entry) == 0:
        print 'No entries'
    else:
        print 'max_key:', max_entry[0][0], 'sum:', max_entry[0][1]

if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/Python/csv_test_3.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import collections

def main():
    parser = argparse.ArgumentParser(description='Sum a column.')
    parser.add_argument('file', type=open)
    parser.add_argument('key_field_index', type=int)
    parser.add_argument('value_field_index', type=int)

    args = parser.parse_args()
    delim = '\t'

    max_field_index = max(args.key_field_index, args.value_field_index)
    sum_by_key = collections.Counter()

    for line in args.file:
        fields = line.rstrip('\n').split(delim)
        if max_field_index < len(fields):
            sum_by_key[fields[args.key_field_index]] += int(fields[args.value_field_index])

    max_entry = sum_by_key.most_common(1)
    if len(max_entry) == 0:
        print('No entries')
    else:
        print('max_key:', max_entry[0][0], 'sum:', max_entry[0][1])

if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/D/csv_test.d:
--------------------------------------------------------------------------------
import std.algorithm, std.conv, std.file, std.stdio;
import fastcsv;

const delim = '\t';

int main(string[] args) {
    if (args.length < 4) {
        writeln("synopsis: ", args[0], " filename keyfield valuefield");
        return 1;
    }

    const filename = args[1],
          keyFieldIndex = args[2].to!size_t,
          valueFieldIndex = args[3].to!size_t,
          maxFieldIndex = max(keyFieldIndex, valueFieldIndex);
    const file = cast(string) read(filename);
    long[string] sumByKey;

    foreach (record; file.csvByRecord!(delim)) {
        if (record.length > maxFieldIndex)
            sumByKey[record[keyFieldIndex]] += record[valueFieldIndex].to!long;
    }

    if (sumByKey.length == 0) {
        writeln("No entries");
    }
    else {
        const maxEntry = sumByKey.byKeyValue.maxElement!"a.value";
        writeln("max_key: ", maxEntry.key, " sum: ", maxEntry.value);
    }

    return 0;
}
--------------------------------------------------------------------------------
/Nim/csv_test.nim:
--------------------------------------------------------------------------------
import os, strutils, streams, tables, parsecsv

const
  Delim = '\t'

proc main() =
  if paramCount() < 3:
    quit("synopsis: " & getAppFilename() & " filename keyfield valuefield")

  let
    filename = paramStr(1)
    keyFieldIndex = parseInt(paramStr(2))
    valueFieldIndex = parseInt(paramStr(3))
    maxFieldIndex = max(keyFieldIndex, valueFieldIndex)

  var
    sumByKey = newCountTable[string]()
    file = newFileStream(filename, fmRead)

  if file == nil:
    quit("cannot open the file " & filename)

  defer: file.close()

  var csv: CsvParser
  open(csv, file, filename, separator=Delim)

  while csv.readRow():
    if len(csv.row) > maxFieldIndex:
      sumByKey.inc(csv.row[keyFieldIndex], parseInt(csv.row[valueFieldIndex]))

  if sumByKey.len() == 0:
    echo "No entries"
  else:
    let largest = sumByKey.largest()
    echo "max_key: ", largest[0], " sum: ", largest[1]

main()

--------------------------------------------------------------------------------
/Go/csvtest.go:
--------------------------------------------------------------------------------
package main

import (
	"bufio"
	"fmt"
	"io"
	"math"
	"os"
	"strconv"
	"strings"
)

func maxEntry(dict map[int]int) (key, value int) {
	mk := 0
	mv := 0

	for k, v := range dict {
		if mv < v {
			mv = v
			mk = k
		}
	}
	return mk, mv
}

func main() {
	sumByKey := make(map[int]int)
	delim := "\t"

	// Need the program name plus three arguments: filename, key field, value field.
	if len(os.Args) < 4 {
		fmt.Println("synopsis: csvtest filename keyfield valuefield")
		os.Exit(1)
	}

	filename := os.Args[1]
	keyFieldIndex, _ := strconv.Atoi(os.Args[2])
	valueFieldIndex, _ := strconv.Atoi(os.Args[3])
	maxFieldIndex := int(math.Max(float64(keyFieldIndex),
		float64(valueFieldIndex)))
	file, err := os.Open(filename)
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}

	reader := bufio.NewReader(file)

	for {
		line, err := reader.ReadString('\n')
		if err == io.EOF {
			break
		}
		record := strings.Split(line, delim)
		if maxFieldIndex < len(record) {
			value, _ := strconv.Atoi(record[valueFieldIndex])
			key, _ := strconv.Atoi(record[keyFieldIndex])
			sumByKey[key] += value
		}
	}

	if len(sumByKey) == 0 {
		fmt.Println("No entries")
	} else {
		maxKey, maxValue := maxEntry(sumByKey)
		fmt.Println("max_key:", maxKey, "sum:", maxValue)
	}
}

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Faster Command Line Tools in Nim

This is a project to test how Nim compares to D in one very specific scenario. It was inspired by the [`Faster Command Line Tools in D` blog post](http://dlang.org/blog/2017/05/24/faster-command-line-tools-in-d/).

There's a full blog post explaining the reasoning, along with some basic results from my system, [available here](https://www.euantorano.co.uk/posts/faster-command-line-tools-in-nim/).

## Running the tests

All versions are built and run using Docker.
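
The computation every implementation performs is the same: sum one tab-separated column grouped by another, then report the key with the largest sum. The `1 2` field indices passed by the run scripts are zero-based, so on the ngrams data the task is roughly equivalent to this awk one-liner (shown for illustration only; it is not part of the benchmark):

```
awk -F'\t' '{ s[$2] += $3 } END { for (k in s) if (s[k] > max) { max = s[k]; key = k } print "max_key:", key, "sum:", max }' data/ngrams.tsv
```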

To run every version and dump results into the `output` folder, you can use the `run` Make target:

```
make run
```

You can also run an individual language's target if you're working on improving it:

- **C**: `make c_run`
- **D**: `make d_run`
- **Go**: `make go_run`
- **Nim**: `make nim_run`
- **Python**: `make python_run`

Any of these targets will download `data/ngrams.tsv` first if it doesn't already exist.

## Results

This repository is built by Travis for every push or PR. Results are published to the `gh-pages` branch:

- [C (GCC)](https://euantorano.github.io/faster-command-line-tools-in-nim/c_gcc.txt)
- [C (clang)](https://euantorano.github.io/faster-command-line-tools-in-nim/c_clang.txt)
- [D (DMD)](https://euantorano.github.io/faster-command-line-tools-in-nim/d_dmd.txt)
- [D (LDC)](https://euantorano.github.io/faster-command-line-tools-in-nim/d_ldc.txt)
- [Go](https://euantorano.github.io/faster-command-line-tools-in-nim/go.txt)
- [Nim](https://euantorano.github.io/faster-command-line-tools-in-nim/nim.txt)
- [Python 2](https://euantorano.github.io/faster-command-line-tools-in-nim/python2.txt)
- [Python 3](https://euantorano.github.io/faster-command-line-tools-in-nim/python3.txt)

## TODO

- [ ] Build an overall results file which will be published to GitHub Pages - possibly include graphs in this file?
- [ ] Run each version multiple times and take the average run times

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
CURRENT_DIR=$(shell pwd)

c_image:
	@docker build -f ./C/Dockerfile.gcc -t csv_test_c_gcc ./C/
	@docker build -f ./C/Dockerfile.clang -t csv_test_c_clang ./C/

d_image:
	@docker build -f ./D/Dockerfile.dmd -t csv_test_d_dmd ./D/
	@docker build -f ./D/Dockerfile.ldc -t csv_test_d_ldc ./D/

go_image:
	@docker build -t csv_test_go ./Go/

nim_image:
	@docker build -t csv_test_nim ./Nim/

python_image:
	@docker build -f ./Python/Dockerfile.python2 -t csv_test_python2 ./Python/
	@docker build -f ./Python/Dockerfile.python3 -t csv_test_python3 ./Python/

build: c_image d_image go_image nim_image python_image

data/ngrams.tsv:
	@mkdir -p data
	@curl --output ./data/ngrams.gz https://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-0.gz
	@gunzip ./data/ngrams.gz
	@mv ./data/ngrams ./data/ngrams.tsv

c_run: c_image data/ngrams.tsv
	@docker run --rm -v $(CURRENT_DIR)/data:/data:ro -v $(CURRENT_DIR)/output:/output -v $(CURRENT_DIR)/resources:/resources:ro csv_test_c_gcc
	@docker run --rm -v $(CURRENT_DIR)/data:/data:ro -v $(CURRENT_DIR)/output:/output -v $(CURRENT_DIR)/resources:/resources:ro csv_test_c_clang

d_run: d_image data/ngrams.tsv
	@docker run --rm -v $(CURRENT_DIR)/data:/data:ro -v $(CURRENT_DIR)/output:/output -v $(CURRENT_DIR)/resources:/resources:ro csv_test_d_dmd
	@docker run --rm -v $(CURRENT_DIR)/data:/data:ro -v $(CURRENT_DIR)/output:/output -v $(CURRENT_DIR)/resources:/resources:ro csv_test_d_ldc

go_run: go_image data/ngrams.tsv
	@docker run --rm -v $(CURRENT_DIR)/data:/data:ro -v $(CURRENT_DIR)/output:/output -v $(CURRENT_DIR)/resources:/resources:ro csv_test_go

nim_run: nim_image data/ngrams.tsv
	@docker run --rm -v $(CURRENT_DIR)/data:/data:ro -v $(CURRENT_DIR)/output:/output -v $(CURRENT_DIR)/resources:/resources:ro csv_test_nim

python_run: python_image data/ngrams.tsv
	@docker run --rm -v $(CURRENT_DIR)/data:/data:ro -v $(CURRENT_DIR)/output:/output -v $(CURRENT_DIR)/resources:/resources:ro csv_test_python2
	@docker run --rm -v $(CURRENT_DIR)/data:/data:ro -v $(CURRENT_DIR)/output:/output -v $(CURRENT_DIR)/resources:/resources:ro csv_test_python3

run: c_run d_run go_run nim_run python_run

clean:
	@rm -Rf data
	@rm -Rf output
	@docker image rm csv_test_c_gcc csv_test_c_clang csv_test_d_dmd csv_test_d_ldc csv_test_go csv_test_nim csv_test_python2 csv_test_python3 2>/dev/null; true

.PHONY: c_image d_image go_image nim_image python_image build c_run d_run go_run nim_run python_run run clean

--------------------------------------------------------------------------------
/C/csv_test.c:
--------------------------------------------------------------------------------
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <fcntl.h>
#include <pthread.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

enum {
	NWorkers = 14,
	NBucket = 2048,
};

typedef struct Bucket Bucket;
typedef struct Worker Worker;

/* One bucket of the hash table: parallel arrays of keys and counts. */
struct Bucket {
	char **key;
	int *count;
	size_t size;
};

/* Each worker thread sums its own [beg, end) slice of the file into a
 * private hash table; the tables are merged after the threads join. */
struct Worker {
	pthread_t thread;
	size_t beg, end;
	Bucket hash[NBucket];
};

Worker w[NWorkers];
char *data;

void
die(char *msg)
{
	fputs(msg, stderr);
	exit(1);
}

int
addcount(Bucket *b, char *key, int count)
{
	size_t n;

	for (n=0; n<b->size; n++)
		if (strcmp(key, b->key[n]) == 0)
			return b->count[n] += count;
	b->size++;
	/* grow the parallel arrays by element size, not raw byte count */
	b->key = realloc(b->key, b->size * sizeof(*b->key));
	b->count = realloc(b->count, b->size * sizeof(*b->count));
	b->key[n] = key;
	return b->count[n] = count;
}

/* Parse this worker's slice. Note the key and value fields (1 and 2, as
 * used by the benchmark) are hardcoded rather than read from argv. */
void *
worker(void *p)
{
	char *key, *cur, *endp;
	size_t beg, end;
	int count;
	uint32_t h;
	Bucket *hash;
	Worker *pw;

	pw = p;
	hash = pw->hash;
	beg = pw->beg;
	end = pw->end;

	cur = &data[beg];
	endp = &data[end];

	do {
		while (*cur != '\t')
			cur++;
		key = ++cur;
		h = 0;
		while (*cur != '\t') {
			h = h*33 + *cur;
			cur++;
		}
		*cur++ = 0;
		count = 0;
		while (*cur != '\t') {
			count = count * 10 + (*cur - '0');
			cur++;
		}
		addcount(&hash[h & (NBucket-1)], key, count);
		cur = memchr(cur, '\n', endp-cur);
	} while (cur && ++cur != endp);

	// printf("Done %zd - %zd\n", beg, end);
	return 0;
}

int
main(int ac, char *av[])
{
	struct stat s;
	int fd, max;
	size_t cur, chunk, fsz, n, i;
	char *nl, *maxk;
	Worker *pw;
	Bucket *b;

	if (ac < 2)
		die("no argument\n");

	fd = open(av[1], O_RDONLY);
	if (fd == -1)
		die("cannot open file\n");
	if (fstat(fd, &s))
		die("cannot stat file\n");
	fsz = s.st_size;
	data = mmap(0, fsz, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (data == MAP_FAILED)
		die("cannot mmap file\n");

	/* start workers, rounding each chunk boundary up to a newline */
	chunk = fsz / NWorkers;
	cur = 0;
	for (pw=w; pw<&w[NWorkers]; pw++) {
		pw->beg = cur;
		cur += chunk;
		if (cur > fsz)
			cur = fsz;
		nl = memchr(&data[cur-1], '\n', fsz-cur+1);
		if (nl)
			cur = nl - data + 1;
		pw->end = cur;
		pthread_create(&pw->thread, 0, worker, pw);
	}
	assert(cur == fsz);

	/* wait for all threads to be done */
	for (pw=w; pw<&w[NWorkers]; pw++)
		pthread_join(pw->thread, 0);

	/* merge every worker's counts into w[0]'s table */
	for (pw=&w[1]; pw<&w[NWorkers]; pw++) {
		for (n=0; n<NBucket; n++) {
			b = &pw->hash[n];
			for (i=0; i<b->size; i++)
				addcount(&w[0].hash[n], b->key[i], b->count[i]);
		}
	}

	/* scan the merged table for the maximum, so that keys seen only
	 * by worker 0 are considered too */
	max = 0;
	maxk = "oops";
	for (n=0; n<NBucket; n++) {
		b = &w[0].hash[n];
		for (i=0; i<b->size; i++) {
			if (b->count[i] > max) {
				max = b->count[i];
				maxk = b->key[i];
			}
		}
	}

	printf("max_key: %s sum: %d\n", maxk, max);
	return 0;
}

--------------------------------------------------------------------------------
/D/fastcsv.d:
--------------------------------------------------------------------------------
/**
 * Experimental fast CSV reader.
 *
 * Based on RFC 4180.
 */
module fastcsv;

/**
 * Reads CSV data from the given filename.
 */
auto csvFromUtf8File(string filename)
{
    import std.file : read;
    return csvToArray(cast(string) read(filename));
}

private char[] filterQuotes(dchar quote)(const(char)[] str) pure
{
    auto buf = new char[str.length];
    size_t j = 0;
    for (size_t i = 0; i < str.length; i++)
    {
        if (str[i] == quote)
        {
            buf[j++] = '"';
            i++;

            if (i >= str.length)
                break;

            if (str[i] == quote)
                continue;
        }
        buf[j++] = str[i];
    }
    return buf[0 .. j];
}

/**
 * Parse CSV data into an input range of records.
 *
 * Params:
 *  fieldDelim = The field delimiter (default: ',')
 *  quote = The quote character (default: '"')
 *  input = The data in CSV format.
 *
 * Returns:
 *  An input range of records, each of which is an array of fields.
 *
 * Bugs:
 * Does not do any validation on the input; will produce nonsensical results
 * if input is malformed.
 *
 * Cannot handle records with more than 65,536 fields each. (This limit can be
 * statically increased by increasing fieldBlockSize.)
 */
auto csvByRecord(dchar fieldDelim=',', dchar quote='"')(const(char)[] input)
{
    struct Result
    {
        private enum fieldBlockSize = 1 << 16;
        private const(char)[] data;
        private const(char)[][] fields;
        private size_t i, curField;

        bool empty = true;
        const(char)[][] front;

        this(const(char)[] input)
        {
            data = input;
            fields = new const(char)[][fieldBlockSize];
            i = 0;
            curField = 0;
            empty = (input.length == 0);
            parseNextRecord();
        }

        void parseNextRecord()
        {
            size_t firstField = curField;
            while (i < data.length && data[i] != '\n' && data[i] != '\r')
            {
                // Parse fields
                size_t firstChar, lastChar;
                bool hasDoubledQuotes = false;

                if (data[i] == quote)
                {
                    import std.algorithm : max;

                    i++;
                    firstChar = i;
                    while (i < data.length)
                    {
                        if (data[i] == quote)
                        {
                            i++;
                            if (i >= data.length || data[i] != quote)
                                break;

                            hasDoubledQuotes = true;
                        }
                        i++;
                    }
                    assert(i-1 < data.length);
                    lastChar = max(firstChar, i-1);
                }
                else
                {
                    firstChar = i;
                    while (i < data.length && data[i] != fieldDelim &&
                           data[i] != '\n' && data[i] != '\r')
                    {
                        i++;
                    }
                    lastChar = i;
                }
                if (curField >= fields.length)
                {
                    // Fields block is full; copy current record fields into
                    // new block so that they are contiguous.
                    auto nextFields = new const(char)[][fieldBlockSize];
                    nextFields[0 .. curField - firstField] =
                        fields[firstField .. curField];

                    //fields.length = firstField; // release unused memory?

                    curField = curField - firstField;
                    firstField = 0;
                    fields = nextFields;
                }
                assert(curField < fields.length);
                if (hasDoubledQuotes)
                    fields[curField++] = filterQuotes!quote(
                        data[firstChar .. lastChar]);
                else
                    fields[curField++] = data[firstChar .. lastChar];

                // Skip over field delimiter
                if (i < data.length && data[i] == fieldDelim)
                    i++;
            }

            front = fields[firstField .. curField];

            // Skip over record delimiter(s)
            while (i < data.length && (data[i] == '\n' || data[i] == '\r'))
                i++;
        }

        void popFront()
        {
            if (i >= data.length)
            {
                empty = true;
                front = [];
            }
            else
                parseNextRecord();
        }
    }
    return Result(input);
}

/**
 * Parses CSV string data into an array of records.
 *
 * Params:
 *  fieldDelim = The field delimiter (default: ',')
 *  quote = The quote character (default: '"')
 *  input = The data in CSV format.
 *
 * Returns:
 *  An array of records, each of which is an array of fields.
 */
auto csvToArray(dchar fieldDelim=',', dchar quote='"')(const(char)[] input)
{
    import core.memory : GC;
    import std.array : array;

    GC.disable();
    auto result = input.csvByRecord!(fieldDelim, quote).array;
    GC.collect();
    GC.enable();
    return result;
}

unittest
{
    auto sampleData =
        `123,abc,"mno pqr",0` ~ "\n" ~
        `456,def,"stuv wx",1` ~ "\n" ~
        `78,ghijk,"yx",2`;

    auto parsed = csvToArray(sampleData);
    assert(parsed == [
        [ "123", "abc", "mno pqr", "0" ],
        [ "456", "def", "stuv wx", "1" ],
        [ "78", "ghijk", "yx", "2" ]
    ]);
}

unittest
{
    auto dosData =
        `123,aa,bb,cc` ~ "\r\n" ~
        `456,dd,ee,ff` ~ "\r\n" ~
        `789,gg,hh,ii` ~ "\r\n";

    auto parsed = csvToArray(dosData);
    assert(parsed == [
        [ "123", "aa", "bb", "cc" ],
        [ "456", "dd", "ee", "ff" ],
        [ "789", "gg", "hh", "ii" ]
    ]);
}

unittest
{
    // Quoted fields that contain newlines and delimiters
    auto nastyData =
        `123,abc,"ha ha ` ~ "\n" ~
        `ha this is a split value",567` ~ "\n" ~
        `321,"a,comma,b",def,111` ~ "\n";

    auto parsed = csvToArray(nastyData);
    assert(parsed == [
        [ "123", "abc", "ha ha \nha this is a split value", "567" ],
        [ "321", "a,comma,b", "def", "111" ]
    ]);
}

unittest
{
    // Quoted fields that contain quotes
    // (Note: RFC-4180 does not allow doubled quotes in unquoted fields)
    auto nastyData =
        `123,"a b ""haha"" c",456` ~ "\n";

    auto parsed = csvToArray(nastyData);
    assert(parsed == [
        [ "123", `a b "haha" c`, "456" ]
    ]);
}

// Boundary condition checks
unittest
{
    auto badData = `123,345,"def""`;
    auto parsed = csvToArray(badData); // should not crash

    auto moreBadData = `123,345,"a"`;
    parsed = csvToArray(moreBadData); // should not crash

    auto yetMoreBadData = `123,345,"`;
    parsed = csvToArray(yetMoreBadData); // should not crash

    auto emptyField = `123,,456`;
    parsed = csvToArray(emptyField);
    assert(parsed == [ [ "123", "", "456" ] ]);
}

static if (__VERSION__ < 2067UL)
{
    // Copied from std.traits, to fill a gap in older versions of Phobos
    import std.typetuple : staticMap, TypeTuple;
    private enum NameOf(alias T) = T.stringof;
    template isNested(T)
        if (is(T == class) || is(T == struct) || is(T == union))
    {
        enum isNested = __traits(isNested, T);
    }
    template FieldNameTuple(T)
    {
        static if (is(T == struct) || is(T == union))
            alias FieldNameTuple = staticMap!(NameOf, T.tupleof[0 .. $ - isNested!T]);
        else static if (is(T == class))
            alias FieldNameTuple = staticMap!(NameOf, T.tupleof);
        else
            alias FieldNameTuple = TypeTuple!"";
    }
}

/**
 * Transcribe CSV data into an array of structs.
 *
 * Params:
 *  S = The type of the struct each record must conform to.
 *  fieldDelim = The field delimiter (default: ',')
 *  quote = The quote character (default: '"')
 *  input = The data in CSV format.
 *
 * Returns:
 *  An array of S.
 *
 * Bugs:
 * Cannot handle strings larger than 64KB each. (This limit can be statically
 * raised by increasing stringBufSize.)
 */
auto csvByStruct(S, dchar fieldDelim=',', dchar quote='"')(const(char)[] input)
    if (is(S == struct))
{
    struct Result
    {
        private enum stringBufSize = 1 << 16;
        private const(char)[] data;
        private char[] stringBuf;
        private size_t i, curStringIdx;

        bool empty = true;
        S front;

        this(const(char)[] input)
        {
            data = input;
            stringBuf = new char[stringBufSize];
            i = 0;
            curStringIdx = 0;

            if (input.length > 0)
            {
                empty = false;
                parseHeader();
                if (input.length > 0)
                    parseNextRecord();
            }
        }

        const(char)[] parseField() pure
        {
            size_t firstChar, lastChar;
            bool hasDoubledQuotes = false;

            if (data[i] == quote)
            {
                import std.algorithm : max;

                i++;
                firstChar = i;
                while (i < data.length)
                {
                    if (data[i] == quote)
                    {
                        i++;
                        if (i >= data.length || data[i] != quote)
                            break;

                        hasDoubledQuotes = true;
                    }
                    i++;
                }
                assert(i-1 < data.length);
                lastChar = max(firstChar, i-1);
            }
            else
            {
                firstChar = i;
                while (i < data.length && data[i] != fieldDelim &&
                       data[i] != '\n' && data[i] != '\r')
                {
                    i++;
                }
                lastChar = i;
            }
            return (hasDoubledQuotes) ?
                filterQuotes!quote(data[firstChar .. lastChar]) :
                data[firstChar .. lastChar];
        }

        void parseHeader()
        {
            static if (__VERSION__ >= 2067UL)
                import std.traits : FieldNameTuple;

            assert(i < data.length);
            foreach (field; FieldNameTuple!S)
            {
                if (parseField() != field)
                    throw new Exception(
                        "CSV fields do not match struct fields");

                // Skip over field delimiter
                if (i < data.length && data[i] == fieldDelim)
                    i++;
            }

            if (i < data.length && data[i] != '\n' && data[i] != '\r')
                throw new Exception("CSV fields do not match struct fields");

            // Skip over record delimiter(s)
            while (i < data.length && (data[i] == '\n' || data[i] == '\r'))
                i++;
        }

        void parseNextRecord()
        {
            import std.conv : to;
            static if (__VERSION__ >= 2067UL)
                import std.traits : FieldNameTuple;

            assert(i < data.length);
            foreach (field; FieldNameTuple!S)
            {
                alias Value = typeof(__traits(getMember, front, field));

                // Convert value
                const(char)[] strval = parseField();
                static if (is(Value == string))
                {
                    // Optimization for string fields: instead of many small
                    // string allocations, consolidate strings into a string
                    // buffer and take slices of it.
                    if (strval.length + curStringIdx >= stringBuf.length)
                    {
                        // String buffer full; allocate new buffer.
                        stringBuf = new char[stringBufSize];
                        curStringIdx = 0;
                    }
                    stringBuf[curStringIdx .. curStringIdx + strval.length] =
                        strval[0 .. $];

                    // Since we never take overlapping slices of stringBuf,
                    // it's safe to assume uniqueness here.
                    import std.exception : assumeUnique;
                    __traits(getMember, front, field) = assumeUnique(
                        stringBuf[curStringIdx .. curStringIdx + strval.length]);
                    curStringIdx += strval.length;
                }
                else
                    __traits(getMember, front, field) = strval.to!Value;

                // Skip over field delimiter
                if (i < data.length && data[i] == fieldDelim)
                    i++;
            }

            if (i < data.length && data[i] != '\n' && data[i] != '\r')
                throw new Exception("Record does not match struct");

            // Skip over record delimiter(s)
            while (i < data.length && (data[i] == '\n' || data[i] == '\r'))
                i++;
        }

        void popFront()
        {
            if (i >= data.length)
            {
                empty = true;
                front = front.init;
            }
            else
                parseNextRecord();
        }
    }
    return Result(input);
}

unittest
{
    import std.algorithm.comparison : equal;

    struct S
    {
        string name;
        int year;
        int month;
        int day;
    }
    auto input =
        `name,year,month,day` ~"\n"~
        `John Smith,1995,1,1` ~"\n"~
        `Jane Doe,1996,2,14` ~"\n"~
        `Albert Donahue,1997,3,30`;

    auto r = input.csvByStruct!S;
    assert(r.equal([
        S("John Smith", 1995, 1, 1),
        S("Jane Doe", 1996, 2, 14),
        S("Albert Donahue", 1997, 3, 30)
    ]));

    // Test failure cases
    import std.exception : assertThrown;

    struct T
    {
        string name;
        int age;
        int customerId;
    }
    assertThrown(input.csvByStruct!T.front);

    auto badInput =
        `name,year,month,day` ~"\n"~
        `1995,Jane Doe,2,14`;
    assertThrown(badInput.csvByStruct!S.front);
}

version(none)
unittest
{
    auto data = csvFromUtf8File("ext/cbp13co.txt");
    import std.stdio;
    writefln("%d records", data.length);
}

// vim:set ai sw=4 ts=4 et:

--------------------------------------------------------------------------------