> 41 | @ 42 | 43 | Possible challenges include multiple numbers in a line and trying to avoid 44 | scanning a line multiple times to detect the last number in it. 45 | 46 | The key data structure is `LINE`. It contains a pointer into the file, the 47 | length of the line and an `int n` for the value of the last number in that 48 | line. 49 | 50 | <>= 51 | typedef struct line { 52 | char *line; /* byte sequence */ 53 | int length; 54 | int n; /* sort key */ 55 | } LINE; 56 | 57 | @ The sort predicate on lines is defined by `cmp`. It derives an integer by 58 | subtracting the numbers belonging to the two lines being compared. The sign 59 | of the integer signals the order to the sort function. 60 | 61 | <>= 62 | static int cmp(const LINE *x, const LINE *y) 63 | { 64 | return x->n - y->n; 65 | } 66 | 67 | @ The main function reads the file into memory, scans it into lines, sorts 68 | the lines, and emits them. 69 | 70 | <

>= 71 | int main (int argc, const char * argv[]) 72 | { 73 | int n,l; 74 | LINE *lines; 75 | char *buffer; 76 | long bufsize; 77 | 78 | if (argc != 2) { 79 | fprintf(stderr,"usage: %s file\n",argv[0]); 80 | return 1; 81 | } 82 | 83 | buffer = readfile(argv[1], &bufsize); 84 | if (!buffer) 85 | return 1; 86 | if (!bufsize) 87 | return 0; /* file empty */ 88 | 89 | lines = scan_buffer(&n, buffer, bufsize); 90 | if (!lines) { 91 | perror("can't allocate"); 92 | return 1; 93 | } 94 | 95 | /* sort line array and emit each line to stdout */ 96 | mergesort(lines, n, sizeof(LINE), 97 | (int (*)(const void *, const void *))cmp); 98 | for (l=0; l>= 121 | static char *readfile(const char *name, long *bufsize) 122 | { 123 | long size, read; 124 | char *buffer; 125 | FILE *file; 126 | 127 | file = fopen(name, "rb"); 128 | if (!file) { 129 | perror("can't open file"); 130 | return NULL; 131 | } 132 | 133 | fseek(file, 0, SEEK_END); 134 | size = ftell(file); 135 | fseek(file, 0, SEEK_SET); 136 | 137 | /* we allocate one more byte that we might fill with \n */ 138 | buffer = malloc(size+1); 139 | if (!buffer) { 140 | perror("can't allocate memory"); 141 | return NULL; 142 | } 143 | read = fread(buffer, 1, size, file); 144 | if (read != size) { 145 | perror("reading from file failed"); 146 | return NULL; 147 | } 148 | fclose(file); 149 | 150 | /* if the last character in the file is not a \n we add it */ 151 | if (*(buffer+size-1) != '\n') { 152 | *(buffer+size) = '\n'; 153 | size++; 154 | } 155 | 156 | *bufsize = size; 157 | return buffer; 158 | } 159 | @ 160 | 161 | ## Parsing a buffer into lines 162 | 163 | The `scan_buffer` routine scans the buffer byte by byte and each line it finds 164 | it adds to a `LINE` struct. Since we can't know the number of lines, an 165 | initial number `m` is guessed. When it is exceeded, ``2*m`` are 166 | allocated (and again doubling if necessary). 
167 | 168 | The tricky bit is to recognize the last number (and sort key) in a line and 169 | enter it into the `LINE` struct. Rather than relying on `atoi(3)` I'm using 170 | a small hack `C2I` that computes the value of a single digit and from there 171 | computes the number when we find more digits. 172 | 173 | When we find a new number we never know whether it is the last number in a 174 | line. Hence we read it but overwrite any previous result. The `outside` flag 175 | is true if we are outside a sequence of digits and if so we can go a little 176 | faster. 177 | 178 | We don't recognize negative numbers. 179 | 180 | <>= 181 | #define C2I(c) ((c - '0')) 182 | 183 | LINE *scan_buffer(int *n, char *buffer, long bufsize) 184 | 185 | { 186 | char *c = buffer; 187 | char *line = buffer; 188 | int outside = 1; /* true iff outside of digits sequence */ 189 | int number = -1; /* last number we read */ 190 | 191 | int m = 100 + bufsize/40; /* max number of lines we can store */ 192 | LINE *lines = malloc(m*sizeof(LINE)); 193 | if (!lines) { 194 | return NULL; 195 | } 196 | *n = 0; /* number of lines read */ 197 | LINE *l = lines; /* current line */ 198 | 199 | 200 | while(c < buffer+bufsize) { 201 | switch (*c) { 202 | case '\n': 203 | /* store line */ 204 | l->line = line; 205 | l->n = number; 206 | l->length = c-line+1; 207 | (*n)++; 208 | 209 | /* prepare for next line */ 210 | number = -1; 211 | outside = 1; 212 | l++; 213 | c++; 214 | line = c; 215 | 216 | /* make room for more lines to store */ 217 | if (*n == m) { 218 | lines = realloc(lines, 2*m*sizeof(LINE)); 219 | if (!lines) { 220 | return NULL; 221 | } 222 | m *= 2; 223 | l = &lines[*n]; 224 | } 225 | 226 | break; 227 | 228 | case '0': /* scan a number */ 229 | case '1': 230 | case '2': 231 | case '3': 232 | case '4': 233 | case '5': 234 | case '6': 235 | case '7': 236 | case '8': 237 | case '9': 238 | if (outside) { 239 | number = C2I(*c); 240 | /* we are inside a digit sequence now */ 241 | outside = 
0; 242 | } else { 243 | number = number * 10 + C2I(*c); 244 | } 245 | c++; 246 | break; 247 | default: 248 | outside = 1; /* outside a digit sequence */ 249 | c++; 250 | break; 251 | } 252 | } 253 | return lines; 254 | } 255 | 256 | -------------------------------------------------------------------------------- /examples/sortlines.md: -------------------------------------------------------------------------------- 1 | # Sortfile 2 | 3 | This code is for the following task: sort a textfile according to the last 4 | number in each line. This challenge was given student applicants by friends 5 | and I took it to develop a program in C with some attention to performance. 6 | 7 | ## Compiling 8 | 9 | To compile this code, use a Makefile that can be extracted from this file 10 | using 11 | 12 | lipsum tangle Makefile sortlines.lp > Makefile 13 | make 14 | 15 | <>= 16 | all: sortlines 17 | 18 | sortlines.c: sortlines.lp 19 | lipsum tangle -f cpp $@ $< > $@ 20 | 21 | sortlines: sortlines.c 22 | gcc -O -o $@ $< 23 | 24 | 25 | ## Types and Main Function 26 | 27 | The idea is to read the file into memory, to build up an array of pointers 28 | to each line in the file, to sort the array, and finally to emit the lines 29 | in sorting order using the sorted array. 30 | 31 | <>= 32 | #include 33 | #include 34 | #include 35 | 36 | <> 37 | <> 38 | <> 39 | <> 40 | <

> 41 | 42 | 43 | 44 | Possible challenges include multiple numbers in a line and trying to avoid 45 | scanning a line multiple times to detect the last number in it. 46 | 47 | The key data structure is `LINE`. It contains a pointer into the file, the 48 | length of the line and an `int n` for the value of the last number in that 49 | line. 50 | 51 | <>= 52 | typedef struct line { 53 | char *line; /* byte sequence */ 54 | int length; 55 | int n; /* sort key */ 56 | } LINE; 57 | 58 | 59 | The sort predicate on lines is defined by `cmp`. It derives an integer by 60 | subtracting the numbers belonging to the two lines being compared. The sign 61 | of the integer signals the order to the sort function. 62 | 63 | <>= 64 | static int cmp(const LINE *x, const LINE *y) 65 | { 66 | return x->n - y->n; 67 | } 68 | 69 | 70 | The main function reads the file into memory, scans it into lines, sorts 71 | the lines, and emits them. 72 | 73 | <

Sortfile

Compiling

Types and Main Function

Reading a file into memory

Parsing a buffer into lines