├── .gitignore ├── Makefile ├── README.md ├── simple-getchar.c ├── simple-read.c └── threads.c /.gitignore: -------------------------------------------------------------------------------- 1 | simple-getchar 2 | simple-read 3 | threads 4 | data 5 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CFLAGS = -std=c99 -D_POSIX_C_SOURCE=200809L -O3 -Wall -Wextra -Wpedantic -Wshadow -Werror 2 | 3 | all : simple-getchar simple-read threads 4 | 5 | threads: threads.c 6 | cc $(CFLAGS) -o $@ $? -pthread 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Beating Haskell with C 2 | 3 | An attempt to measure a simple, a multithreaded, and a unicode-aware version of 4 | wc written in C compared with the optimized Haskell implementation in 5 | [ChrisPenner/wc](https://github.com/ChrisPenner/wc). 6 | 7 | Unicode version is forthcoming, but will be a cleaned up version of 8 | https://begriffs.com/posts/2019-05-23-unicode-icu.html#counting-words-and-graphemes 9 | 10 | ### To run 11 | 12 | The fastest version is in threads.c. 
13 | 14 | ```sh 15 | make 16 | ./threads N /path/to/file 17 | 18 | # where N is the number of threads to use 19 | ``` 20 | -------------------------------------------------------------------------------- /simple-getchar.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | int main(void) 6 | { 7 | unsigned long long chars=0, words=0, lines=0; 8 | int c; 9 | bool gotspace = true; 10 | 11 | while((c = getchar()) != EOF) 12 | { 13 | chars++; 14 | if (isspace(c)) 15 | { 16 | gotspace = true; 17 | if (c == '\n') 18 | lines++; 19 | } 20 | else if(gotspace) 21 | { 22 | gotspace = false; 23 | words++; 24 | } 25 | } 26 | printf("%llu %llu %llu\n", lines, words, chars); 27 | } 28 | -------------------------------------------------------------------------------- /simple-read.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | #define BUFSZ 1024*1024 9 | 10 | int main(void) 11 | { 12 | unsigned long long chars=0, words=0, lines=0; 13 | bool gotspace = true; 14 | char *buf = malloc(BUFSZ), c; 15 | ssize_t index, length; 16 | 17 | while ((length = read(STDIN_FILENO, buf, BUFSZ)) > 0) 18 | { 19 | for (index = 0; index < length; ++index) 20 | { 21 | chars++; 22 | if (isspace(c = buf[index])) 23 | { 24 | gotspace = true; 25 | if (c == '\n') 26 | lines++; 27 | } 28 | else if(gotspace) 29 | { 30 | gotspace = false; 31 | words++; 32 | } 33 | } 34 | } 35 | free(buf); 36 | printf("%llu %llu %llu\n", lines, words, chars); 37 | } 38 | -------------------------------------------------------------------------------- /threads.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | struct counts 13 | { 14 | long chars; 15 | long words; 16 | long lines; 
17 | 18 | char cfirst; 19 | char clast; 20 | }; 21 | 22 | struct chunk 23 | { 24 | int fd; 25 | off_t offset; 26 | size_t length; 27 | }; 28 | 29 | #define MIN(x,y) ((x) < (y) ? (x) : (y)) 30 | #define BUFSZ 1024*1024 31 | 32 | struct counts *count_chunk(struct chunk *chunk) 33 | { 34 | long chars=0, words=0, lines=0; 35 | struct counts *ret = calloc(1, sizeof(struct counts)); 36 | bool gotspace = true; 37 | char c = '\0', *buf = malloc(BUFSZ); 38 | ssize_t index, n, remaining = chunk->length; 39 | 40 | lseek(chunk->fd, chunk->offset, SEEK_SET); 41 | while ((n = read(chunk->fd, buf, MIN(BUFSZ, remaining))) > 0) 42 | { 43 | if (!ret->cfirst) 44 | ret->cfirst = buf[0]; 45 | for (index = 0; index < n; ++index) 46 | { 47 | chars++; 48 | if (isspace(c = buf[index])) 49 | { 50 | gotspace = true; 51 | if (c == '\n') 52 | lines++; 53 | } 54 | else if(gotspace) 55 | { 56 | gotspace = false; 57 | words++; 58 | } 59 | } 60 | remaining -= n; 61 | } 62 | free(buf); 63 | 64 | ret->chars = chars; 65 | ret->words = words; 66 | ret->lines = lines; 67 | ret->clast = c; 68 | return ret; 69 | } 70 | 71 | int main(int argc, const char **argv) 72 | { 73 | off_t filesz; 74 | int i, nWorkers = 4; 75 | pthread_t *workers; 76 | struct chunk *chunks; 77 | struct counts *result, sum = {0}; 78 | struct stat in_stat; 79 | const char *path; 80 | 81 | if (argc == 3) 82 | { 83 | if (sscanf(argv[1], "%d", &nWorkers) < 1 || nWorkers < 1) 84 | { 85 | fputs("Workers argument must be integer >= 1\n", stderr); 86 | return EXIT_FAILURE; 87 | } 88 | path = argv[2]; 89 | } 90 | else 91 | { 92 | fprintf(stderr, "Usage: %s nWorkers path\n", argv[0]); 93 | return EXIT_FAILURE; 94 | } 95 | 96 | stat(path, &in_stat); 97 | filesz = in_stat.st_size; 98 | 99 | workers = malloc(nWorkers * sizeof(pthread_t)); 100 | chunks = malloc(nWorkers * sizeof(struct chunk)); 101 | for (i = 0; i < nWorkers; i++) 102 | { 103 | chunks[i].fd = open(path, O_RDONLY); 104 | chunks[i].offset = i*(filesz/nWorkers); 105 | /* due to 
division rounding, have the last thread read to EOF */ 106 | chunks[i].length = (i < nWorkers-1) ? filesz/nWorkers : LONG_MAX; 107 | pthread_create(&workers[i], NULL, 108 | (void *(*)(void *))count_chunk, &chunks[i]); 109 | } 110 | 111 | /* run free, threads! */ 112 | 113 | for (i = 0; i < nWorkers; i++) 114 | { 115 | pthread_join(workers[i], (void **)&result); 116 | close(chunks[i].fd); 117 | sum.lines += result->lines; 118 | sum.words += result->words; 119 | sum.chars += result->chars; 120 | /* don't double-count a word split between chunks */ 121 | if (sum.clast && !isspace(sum.clast) && !isspace(result->cfirst)) 122 | sum.words--; 123 | sum.cfirst = result->cfirst; 124 | sum.clast = result->clast; 125 | free(result); 126 | } 127 | 128 | printf("%ld %ld %ld\n", sum.lines, sum.words, sum.chars); 129 | free(workers); 130 | free(chunks); 131 | } 132 | --------------------------------------------------------------------------------