├── .gitignore
├── file1
├── file2
├── Makefile
├── COPYING.txt
├── README.md
└── align_regions.c


/.gitignore:
--------------------------------------------------------------------------------
1 | *.al
2 | align_regions
3 | 


--------------------------------------------------------------------------------
/file1:
--------------------------------------------------------------------------------
1 | a b c d e
2 | .End of Sentence
3 | .PARA
4 | f g h i
5 | .End of Sentence
6 | j k
7 | .End of Sentence
8 | .PARA
9 | 


--------------------------------------------------------------------------------
/file2:
--------------------------------------------------------------------------------
 1 | A B C
 2 | .End of Sentence
 3 | D E
 4 | .End of Sentence
 5 | .PARA
 6 | F G H I
 7 | .End of Sentence
 8 | J K
 9 | .End of Sentence
10 | .PARA
11 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | all: file1.al file2.al
 3 | 	cat file1.al file2.al
 4 | 
 5 | file1.al file2.al: align_regions file1 file2
 6 | 	./align_regions -D '.PARA' -d '.End of Sentence' -v file1 file2
 7 | 
 8 | align_regions: align_regions.c
 9 | 	cc -std=c89 align_regions.c -lm -o align_regions 
10 | 
11 | clean:
12 | 	rm -f file1.al file1.al align_regions
13 | 
14 | 


--------------------------------------------------------------------------------
/COPYING.txt:
--------------------------------------------------------------------------------
 1 | From: Kenneth W Church <kwchurch@us.ibm.com>
 2 | Subject: Re: Code for Gale & Church (1993)
 3 | Date: 25 January 2016 at 7:25:42 AM CST
 4 | To: "Schwartz, Lane Oscar Bingaman" <lanes@illinois.edu>
 5 | 
 6 | that fine
 7 | 
 8 | good luck with your current position!
 9 | 
10 | 
11 | 
12 | 
13 | From: "Schwartz, Lane Oscar Bingaman" <lanes@illinois.edu>
14 | To: Kenneth W Church/Watson/IBM@IBMUS
15 | Date: 01/24/2016 01:35 PM
16 | Subject: Code for Gale & Church (1993)
17 | 
18 | Ken, 
19 | 
20 | You may or may not remember me - I visited JHU as a Ph.D student for the first SCALE workshop, and we briefly interacted there (I was working with Chris Callison-Burch on the Joshua machine translation system). I’m now an assistant professor in the Linguistics Department at the University of Illinois at Urbana-Champaign.
21 | 
22 | Regardless, I wanted to let you know that I finally took the time to type up your code from the appendix of Gale & Church (1993) and put it up on github. I’m a big believer in open source software, and I’ve always admired that you put the code for the sentence aligner right there in the paper. I made a couple of very minor modifications to get the code to compile on a modern C compiler.
23 | 
24 | I know that numerous other implementations of your algorithm exist, but I thought that there should be a copy of your original code in electronic form for those who wish to use the original implementation. I hope that’s OK with you.
25 | 
26 | https://github.com/dowobeha/Gale_and_Church_1993
27 | 
28 | Cheers,
29 | Lane Schwartz
30 | 
31 | 
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # A Program for Aligning Sentences in Bilingual Corpora
 2 | 
 3 | The accompanying program `align_regions.c` was originally published in the Appendix of [Gale & Church (1993)](http://aclweb.org/anthology/J/J93/J93-1004.pdf) in the journal Computational Linguistics, Volume 19, Number 1, March 1993.
 4 | 
 5 | A small number of very minor changes have been made to the program in order to enable compilation using modern C compilers:
 6 | * Two include statements were changed, replacing deprecated headers with their modern equivalents.
 7 | * The original program also included code where a struct was passed as a parameter to a function where the function required a pointer to a struct; such code has been changed to explicitly take the address of the relevant struct object.
 8 | * Whitespace has been altered in some places to improve readability.
 9 | 
10 | This repository also includes two small sample text files and a Makefile, demonstrating how to compile and run the program. The code was transcribed from the Appendix by Lane Schwartz.
11 | 
12 | 
13 | The code was written by [William A. Gale](https://linguistlist.org/issues/13/13-2047.html) and [Kenneth W. Church](http://researcher.watson.ibm.com/researcher/view.php?person=us-kwchurch), with [Michael D. Riley](http://research.google.com/pubs/author125.html), and was presented with the following note:
14 | 
15 | > The following code is the core of align. It is a C language program that inputs two 
16 | > text files, with one token (word) per line. The text files contain a number of delimiter
17 | > tokens. There are two types of delimiter tokens: "hard" and "soft." The hard regions
18 | > (e.g., paragraphs) may not be changed, and there must be equal numbers of them in
 19 | > the two input files. The soft regions (e.g., sentences) may be deleted (1-0), inserted
 20 | > (0-1), substituted (1-1), contracted (2-1), expanded (1-2), or merged (2-2) as necessary so
21 | > that the output ends up with the same number of soft regions. The program generates
22 | > two output files. The two output files contain an equal number of soft regions, each
23 | > on a line. If the -v command line option is included, each soft region is preceded by
24 | > its probability score.
25 | 


--------------------------------------------------------------------------------
/align_regions.c:
--------------------------------------------------------------------------------
  1 | #include <fcntl.h>
  2 | #include <stdlib.h>
  3 | #include <math.h>
  4 | #include <stdio.h>
  5 | #include <string.h>
  6 | #include <sys/mman.h>
  7 | #include <sys/types.h>
  8 | #include <limits.h>
  9 | #include <sys/stat.h>
 10 | 
 11 | /*
 12 |   usage:
 13 |   align_regions -D '.PARA' -d '.End of Sentence'  file1 file2
 14 | 
 15 |   outputs two files: file1.al & file2.al
 16 | 
 17 |   hard regions are delimited by the -D arg
 18 |   soft regions are delimited by the -d arg
 19 |   */
 20 | 
/* Accessors for the dynamic-programming tables of seq_align(), which are
   flat arrays of (nx + 1) * (ny + 1) ints indexed with x as the row.
   The macros expand to references to the variables distances, path_x,
   path_y and ny, so they only work where those names are in scope. */
#define dist(x,y) distances[(x) * ((ny) + 1) + (y)]
#define pathx(x,y) path_x[(x) * ((ny) + 1) + (y)]
#define pathy(x,y) path_y[(x) * ((ny) + 1) + (y)]
/* size of the buffer main() uses to build the output file names */
#define MAX_FILENAME 256
/* distance match() returns when the computed probability is not positive */
#define BIG_DISTANCE 2500

/* Dynamic Programming Optimization */
/* One step of an alignment produced by seq_align(): (x1, x2) is paired
   with (y1, y2), a zero marks an unused slot, and d is the distance
   charged for this step. */
struct alignment {
  int x1;
  int y1;
  int x2;
  int y2;
  int d;
};

char *hard_delimiter = NULL;    /* -D arg */
char *soft_delimiter = NULL;    /* -d arg */
int verbose = 0;                /* -v arg */

/* utility functions */
/* Old-style (non-prototype) declarations; the definitions are at the end
   of the file. */
char *readchars(), **readlines(), **substrings();
void err();
 43 | 
 44 | /*
 45 |   seq_align by Mike Riley
 46 | 
 47 |   x and y are sequences of objects, represented as non-zero ints,
 48 |   to be aligned.
 49 | 
 50 |   dist_funct(x1, y1, x2, y2) is a distance function of 4 args:
 51 | 
 52 |     dist_funct(x1, y1, 0, 0) gives cost of substitution of x1 by y1.
 53 |     dist_funct(x1, 0, 0, 0) gives cost of deletion of x1.
 54 |     dist_funct(0, y1, 0, 0) gives cost of insertion of y1.
 55 |     dist_funct(x1, y1, x2, 0) gives cost of contraction of (x1,x2) to y1.
 56 |     dist_funct(x1, y1, 0, y2) gives cost of expansion of x1 to (y1,y2).
 57 |     dist_funct(x1, y1, x2, y2) gives cost to match (x1,x2) to (y1,y2).
 58 | 
 59 |   align is the alignment, with (align[i].x1, align[i].x2) aligned
 60 |     with (align[i].y1, align[i].y2).  Zero in align[].x1 and align[].y1
 61 |     correspond to insertion and deletion, respectively.  Non-zero in
 62 |     align[].x2 and align[].y2 correspond to contraction and expansion,
 63 |     respectively.  align[].d gives the distance for that pairing.
 64 | 
 65 | 
 66 |   The function returns the length fo the alignment.
 67 | */
 68 | int
 69 | seq_align(x, y, nx, ny, dist_funct, align)
 70 |      int *x, *y, nx, ny;
 71 |      int (*dist_funct)();
 72 |      struct alignment **align;
 73 | {
 74 |   int *distances, *path_x, *path_y, n;
 75 |   int i, j, oi, oj, di, dj, d1, d2, d3, d4, d5, d6, dmin;
 76 |   struct alignment *ralign;
 77 | 
 78 |   distances = (int *) malloc((nx + 1) * (ny + 1) * sizeof(int));
 79 |   path_x = (int *) malloc((nx + 1) * (ny + 1) * sizeof(int));
 80 |   path_y = (int *) malloc((nx + 1) * (ny + 1) * sizeof(int));
 81 |   ralign = (struct alignment *) malloc((nx + ny) * sizeof(struct alignment));
 82 | 
 83 |   for(j = 0; j <= ny; j++) {
 84 |     for(i = 0; i <= nx; i++) {
 85 |       d1 = i>0 && j>0 ?         /* substitution */
 86 | 	dist(i-1, j-1) + (*dist_funct)(x[i-1], y[j-1], 0, 0)
 87 | 	  : INT_MAX;
 88 |       d2 = i>0 ?                /* deletion */
 89 | 	dist(i-1, j) + (*dist_funct)(x[i-1], 0, 0, 0)
 90 | 	  : INT_MAX;
 91 |       d3 = j>0 ?                /* insertion */
 92 | 	dist(i, j-1) + (*dist_funct)(0, y[j-1], 0, 0)
 93 | 	  : INT_MAX;
 94 |       d4 = i>1 && j>0 ?        /* contraction */
 95 | 	dist(i-2, j-1) + (*dist_funct)(x[i-2], y[j-1], x[i-1], 0)
 96 | 	  : INT_MAX;
 97 |       d5 = i>0 && j>1 ?        /* expansion */
 98 | 	dist(i-1, j-2) + (*dist_funct)(x[i-1], y[j-2], 0, y[j-1])
 99 | 	  : INT_MAX;
100 |       d6 = i>1 && j>1 ?        /* melding */
101 | 	dist(i-2, j-2) + (*dist_funct)(x[i-2], y[j-2], x[i-1], y[j-1])
102 | 	  : INT_MAX;
103 | 
104 |       dmin = d1;
105 |       if (d2<dmin) dmin=d2;
106 |       if (d3<dmin) dmin=d3;
107 |       if (d4<dmin) dmin=d4;
108 |       if (d5<dmin) dmin=d5;
109 |       if (d6<dmin) dmin=d6;
110 | 
111 |       if(dmin == INT_MAX) {
112 | 	dist(i, j) = 0;
113 |       }
114 |       else if(dmin == d1) {
115 | 	dist(i,j) = d1;
116 | 	pathx(i,j) = i-1;
117 | 	pathy(i,j) = j-1;
118 |       }
119 |       else if(dmin == d2) {
120 | 	dist(i,j) = d2;
121 | 	pathx(i,j) = i-1;
122 | 	pathy(i,j) = j;
123 |       }
124 |       else if(dmin == d3) {
125 | 	dist(i,j) = d3;
126 | 	pathx(i,j) = i;
127 | 	pathy(i,j) = j-1;
128 |       }
129 |       else if(dmin == d4) {
130 | 	dist(i,j) = d4;
131 | 	pathx(i,j) = i-2;
132 | 	pathy(i,j) = j-1;
133 |       }
134 |       else if(dmin == d5) {
135 | 	dist(i,j) = d5;
136 | 	pathx(i,j) = i-1;
137 | 	pathy(i,j) = j-2;
138 |       }
139 |       else /* dmin == d6 */ {
140 | 	dist(i,j) = d6;
141 | 	pathx(i,j) = i-2;
142 | 	pathy(i,j) = j-2;
143 |       }
144 |     }
145 |   }
146 | 
147 |   n = 0;
148 | 
149 |   for(i=nx, j=ny ; i>0|| j>0 ; i = oi, j = oj) {
150 | 
151 |     oi = pathx(i, j);
152 |     oj = pathy(i, j);
153 |     di = i - oi;
154 |     dj = j - oj;
155 | 
156 |     if(di == 1 && dj == 1) {  /* substitution */
157 |       ralign[n].x1 = x[i-1];
158 |       ralign[n].y1 = y[j-1];
159 |       ralign[n].x2 = 0;
160 |       ralign[n].y2 = 0;
161 |       ralign[n++].d = dist(i, j) - dist(i-1, j-1);
162 |     }
163 | 
164 |     else if(di == 1 && dj == 0) {  /* deletion */
165 |       ralign[n].x1 = x[i-1];
166 |       ralign[n].y1 = 0;
167 |       ralign[n].x2 = 0;
168 |       ralign[n].y2 = 0;
169 |       ralign[n++].d = dist(i, j) - dist(i-1, j);
170 |     }
171 | 
172 |     else if(di ==  0 && dj == 1) {  /* insertion */
173 |       ralign[n].x1 = 0;
174 |       ralign[n].y1 = y[j-1];
175 |       ralign[n].x2 = 0;
176 |       ralign[n].y2 = 0;
177 |       ralign[n++].d = dist(i, j) - dist(i, j-1);
178 |     }
179 | 
180 |     else if(dj == 1) {  /* contraction */
181 |       ralign[n].x1 = x[i-2];
182 |       ralign[n].y1 = y[j-1];
183 |       ralign[n].x2 = x[i-1];
184 |       ralign[n].y2 = 0;
185 |       ralign[n++].d = dist(i, j) - dist(i-2, j-1);
186 |     }
187 | 
188 |     else if(di == 1) {  /* expansion */
189 |       ralign[n].x1 = x[i-1];
190 |       ralign[n].y1 = y[j-2];
191 |       ralign[n].x2 = 0;
192 |       ralign[n].y2 = y[j-1];
193 |       ralign[n++].d = dist(i, j) - dist(i-1, j-2);
194 |     }
195 | 
196 |     else /* di == 2 && dj == 2 */ {  /* melding */
197 |       ralign[n].x1 = x[i-2];
198 |       ralign[n].y1 = y[j-2];
199 |       ralign[n].x2 = x[i-1];
200 |       ralign[n].y2 = y[j-1];
201 |       ralign[n++].d = dist(i, j) - dist(i-2, j-2);
202 |     }
203 |   }
204 | 
205 |   *align = (struct alignment *) malloc(n * sizeof(struct alignment));
206 |   
207 |   for(i=0; i<n; i++)
208 |     bcopy(ralign + i, (*align) + (n-i-1), sizeof(struct alignment));
209 | 
210 |   free(distances);
211 |   free(path_x);
212 |   free(path_y);
213 |   free(ralign);
214 | 
215 |   return(n);
216 | }
217 | 
218 | /* Local Distance Function */
219 | 
220 | /* Returns the area under a normal distribution
221 |    from -inf to z standard deviations */
222 | double
223 | pnorm(z)
224 |      double z;
225 | {
226 |   double t, pd;
227 |   t = 1/(1 + 0.2316419 * z);
228 |   pd = 1 - 0.3989423 *
229 |     exp(-z * z/2) *
230 |     ((((1.330274429 * t - 1.821255978) * t
231 |        + 1.781477937) * t - 0.356563782) * t + 0.319381530) * t;
232 |   /* see Abramowitz, M., and I. Stegun (1964), 26.2.17 p. 932 */
233 |   return(pd);
234 | }
235 | 
236 | /* Return -100 * log probability that an English sentence of length
237 |    len1 is a translation of a foreign sentence of length len2. The
238 |    probability is based on two parameters, the mean and variance of
239 |    number of foreign characters per English character.
240 | */
241 | int
242 | match(len1, len2)
243 |      int len1, len2;
244 | {
245 |   double z, pd, mean;
246 |   double c = 1;
247 |   double s2 = 6.8 ;
248 | 
249 |   if(len1==0 && len2==0) return(0);
250 |   mean = (len1 + len2/c)/2;
251 |   z = (c * len1 - len2)/sqrt(s2 * mean);
252 | 
253 |   /* Need to deal with both sides of the normal distribution */
254 |   if(z < 0) z = -z;
255 |   pd = 2 * (1 - pnorm(z));
256 | 
257 |   if(pd > 0) return((int)(-100 * log(pd)));
258 |   else return(BIG_DISTANCE);
259 | }
260 | 
/* Distance function handed to seq_align().  The arguments are region
   lengths; a zero x2 or y2 means that slot is unused.  Each kind of
   pairing costs the match() distance of the combined lengths plus a
   fixed penalty for that kind of pairing. */
int
two_side_distance(x1, y1, x2, y2)
     int x1, y1, x2, y2;
{
  int penalty21 = 230;  /* -100 * log([prob of 2-1 match] / [prob of 1-1 match]) */
  int penalty22 = 440;  /* -100 * log([prob of 2-2 match] / [prob of 1-1 match]) */
  int penalty01 = 450;  /* -100 * log([prob of 0-1 match] / [prob of 1-1 match]) */

  if(x2 != 0 && y2 != 0) {       /* merger */
    return(match(x1 + x2, y1 + y2) + penalty22);
  }
  if(x2 != 0) {                  /* contraction */
    return(match(x1 + x2, y1) + penalty21);
  }
  if(y2 != 0) {                  /* expansion */
    return(match(x1, y1 + y2) + penalty21);
  }

  /* only x1 and y1 are in play from here on */
  if(x1 == 0 || y1 == 0) {       /* insertion or deletion */
    return(match(x1, y1) + penalty01);
  }
  return(match(x1, y1));         /* substitution */
}
288 | 
289 | /* Functions for Manipulating Regions */
290 | 
/* A run of consecutive lines: lines points at the first line of the run
   and length is the number of lines in it. */
struct region{
  char **lines;
  int length;
};
295 | 
296 | void
297 | print_region(fd, region, score)
298 |      int score;
299 |      FILE *fd;
300 |      struct region *region;
301 | {
302 |   char **lines, **end;
303 | 
304 |   lines = region->lines;
305 |   end = lines + region->length;
306 |   for( ; lines < end ; lines++)
307 |     fprintf(fd, "%s\n", *lines);
308 | }
309 | 
310 | int
311 | length_of_a_region(region)
312 |      struct region *region;
313 | {
314 |   int result;
315 |   char **lines, **end;
316 | 
317 |   lines = region->lines;
318 |   end = lines + region->length;
319 |   result = end - lines;
320 | 
321 |   for( ; lines < end; lines++)
322 |     result += strlen(*lines);
323 |   return(result);
324 | }
325 | 
326 | int *
327 | region_lengths(regions, n)
328 |      struct region *regions;
329 |      int n;
330 | {
331 |   int i;
332 |   int *result;
333 | 
334 |   result = (int *)malloc(n * sizeof(int));
335 |   if(result == NULL) err("malloc failed");
336 | 
337 |   for(i = 0; i < n; i++)
338 |     result[i] = length_of_a_region(&regions[i]);
339 |   return(result);
340 | }
341 | 
342 | struct region *
343 | find_sub_regions(region, delimiter, len_ptr)
344 |      struct region *region;
345 |      char *delimiter;
346 |      int *len_ptr;
347 | {
348 |   struct region *result;
349 |   char **l, **lines, **end;
350 |   int n = 0;
351 | 
352 |   lines = region->lines;
353 |   end = lines + region->length;
354 | 
355 |   for(l = lines; l < end; l++) {
356 |     if(delimiter && strcmp(*l, delimiter) == 0) n++;
357 |   }
358 | 
359 |   result = (struct region *)calloc(n+1, sizeof(struct region));
360 |   if(result == NULL) err("malloc failed");
361 |   *len_ptr = n;
362 |   n = 0;
363 |   result[0].lines = lines;
364 |   for(l = lines; l < end; l++)
365 |     if(delimiter && strcmp(*l, delimiter) == 0) {
366 |       result[n].length = l - result[n].lines;
367 |       result[n+1].lines = l+1;
368 |       n++;
369 |     }
370 |   result[n].length = l - result[n].lines;
371 |   if(n != *len_ptr) {
372 |     exit(2);
373 |   }
374 |   return(result);
375 | }
376 | 
377 | /* Top Level Main Function */
378 | 
379 | int
380 | main(argc, argv)
381 | int argc;
382 | char **argv;
383 | {
384 | 
385 |   char **lines1, **lines2;
386 |   int number_of_lines1, number_of_lines2;
387 |   struct region *hard_regions1, *hard_regions2, *soft_regions1, *soft_regions2;
388 |   struct region *hard_end1, *hard_end2, tmp;
389 |   int number_of_hard_regions1;
390 |   int number_of_hard_regions2;
391 |   int number_of_soft_regions1;
392 |   int number_of_soft_regions2;
393 |   int *len1, *len2;
394 |   int c, n, i, ix, iy, prevx, prevy;
395 |   struct alignment *align, *a;
396 |   FILE *out1, *out2;
397 |   char filename[MAX_FILENAME];
398 |   extern char *optarg;
399 |   extern int optind;
400 | 
401 |   /* parse arguments */
402 |   while((c = getopt(argc, argv, "vd:D:")) != EOF)
403 |     switch(c) {
404 |     case 'v':
405 |       verbose = 1;
406 |       break;
407 |     case 'd':
408 |       soft_delimiter = strdup(optarg);
409 |       break;
410 |     case 'D':
411 |       hard_delimiter = strdup(optarg);
412 |       break;
413 |     default:
414 |       exit(2);
415 |     }
416 |   if(argc != optind + 2) err("wrong number of arguments");
417 | 
418 |   /* open output files */
419 |   sprintf(filename, "%s.al", argv[optind]);
420 |   out1 = fopen(filename, "w");
421 |   if(out1 == NULL) {
422 |     fprintf(stderr, "can't open %s\n", filename);
423 |     exit(2);
424 |   }
425 | 
426 |   sprintf(filename, "%s.al", argv[optind+1]);
427 |   out2 = fopen(filename, "w");
428 |   if(out2 == NULL) {
429 |     fprintf(stderr, "can't open %s\n", filename);
430 |     exit(2);
431 |   }
432 | 
433 |   lines1 = readlines(argv[optind], &number_of_lines1);
434 |   lines2 = readlines(argv[optind+1], &number_of_lines2);
435 | 
436 |   tmp.lines = lines1;
437 |   tmp.length = number_of_lines1;
438 |   hard_regions1 = find_sub_regions(&tmp, hard_delimiter, &number_of_hard_regions1);
439 | 
440 |   tmp.lines = lines2;
441 |   tmp.length = number_of_lines2;
442 |   hard_regions2 = find_sub_regions(&tmp, hard_delimiter, &number_of_hard_regions2);
443 | 
444 |   if(number_of_hard_regions1 != number_of_hard_regions2)
445 |     err("align_regions: input files do not contain the same number of hard regions");
446 | 
447 |   hard_end1 = hard_regions1 + number_of_hard_regions1;
448 |   hard_end2 = hard_regions2 + number_of_hard_regions2;
449 | 
450 |   for ( ; hard_regions1 < hard_end1 ; hard_regions1++, hard_regions2++) {
451 |     
452 |     soft_regions1 = find_sub_regions(&hard_regions1[0], soft_delimiter, &number_of_soft_regions1);
453 |     soft_regions2 = find_sub_regions(&hard_regions2[0], soft_delimiter, &number_of_soft_regions2);
454 | 
455 |     len1 = region_lengths(soft_regions1, number_of_soft_regions1);
456 |     len2 = region_lengths(soft_regions2, number_of_soft_regions2);
457 | 
458 |     n = seq_align(len1, len2, number_of_soft_regions1, number_of_soft_regions2, 
459 | 		  two_side_distance, &align);
460 | 
461 |     prevx = prevy = ix = iy = 0;
462 | 
463 |     for(i = 0; i < n; i++) {
464 |       a = &align[i];
465 |       if(a->x2 > 0) ix++; else if(a->x1 == 0) ix--;
466 |       if(a->y2 > 0) iy++; else if(a->y1 == 0) iy--;
467 |       if(a->x1 == 0 && a->y1 == 0 && a->x2 == 0 && a->y2 == 0) { ix++; iy++; }
468 |       ix++;
469 |       iy++;
470 |       if(verbose) {
471 | 	fprintf(out1, ".Score %d\n", a->d);
472 | 	fprintf(out2, ".Score %d\n", a->d);
473 |       }
474 | 
475 |       for ( ; prevx < ix; prevx++)
476 | 	print_region(out1, &soft_regions1[prevx], a->d);
477 | 
478 |       for ( ; prevy < iy; prevy++)
479 | 	print_region(out2, &soft_regions2[prevy], a->d);
480 |     }
481 | 
482 |     free(align);
483 |     free(soft_regions1);
484 |     free(soft_regions2);
485 |     free(len1);
486 |     free(len2);
487 |   }
488 | 
489 |   return 0;
490 |   
491 | }
492 | /* Utility Functions */
493 | 
/* Report msg on stderr, prefixed with "**ERROR**: ", and terminate the
   program with exit status 2.  Does not return. */
void
err(msg)
     char *msg;
{
  (void) fprintf(stderr, "**ERROR**: %s\n", msg);
  exit(2);
}
501 | 
/* return the contents of the file as a newly allocated buffer
   and stuff the number of bytes read into len_ptr

   The buffer is NOT null-terminated and is owned by the caller.
   Any failure is reported through err(), which exits. */
char *
readchars(filename, len_ptr)
     char *filename;
     int *len_ptr;
{
  FILE *fd;
  char *result;
  struct stat stat_buf;
  size_t size;

  fd = fopen(filename, "r");
  if(fd == NULL) err("open failed");

  if(fstat(fileno(fd), &stat_buf) == -1)
    err("stat failed");

  /* the length is handed back as an int, so refuse what would not fit */
  if(stat_buf.st_size < 0 || stat_buf.st_size > INT_MAX)
    err("file too large");
  size = (size_t)stat_buf.st_size;
  *len_ptr = (int)size;

  /* malloc(0) may return NULL; an empty file is not an allocation failure */
  result = malloc(size > 0 ? size : 1);
  if(result == NULL) err("malloc failed");

  if(fread(result, sizeof(char), size, fd) != size)
    err("fread failed");

  fclose(fd);

  return(result);
}
/* Cut the characters in [string, end) into pieces at every delimiter
   character.  Leading delimiters are skipped first; each remaining
   delimiter is overwritten with a 0 so that the piece before it becomes
   a C string.

   Returns a newly allocated array of pointers into the input (nothing is
   copied) and stores the number of pieces, which is the number of
   delimiters found, in len_ptr.  The array has one extra entry after
   those, pointing just past the last delimiter; it is not counted. */
char **
substrings(string, end, delimiter, len_ptr)
     char *string, *end, delimiter;
     int *len_ptr;
{
  char **pieces, *p;
  int count, filled;

  /* skip leading delimiters */
  for( ; string < end && *string == delimiter; string++)
    ;

  /* first pass: count the delimiters */
  count = 0;
  p = string;
  while(p < end) {
    if(*p == delimiter) count++;
    p++;
  }
  *len_ptr = count;

  pieces = (char **)malloc(sizeof(char *) * (count+1));
  if(pieces == NULL) err("malloc failed");

  /* second pass: terminate each piece and note where the next one starts */
  pieces[0] = string;
  filled = 0;
  p = string;
  while(p < end) {
    if(*p == delimiter) {
      *p = 0;
      pieces[++filled] = p+1;
    }
    p++;
  }

  /* both passes must agree on the number of delimiters */
  if(filled != *len_ptr) {
    exit(2);
  }

  return(pieces);
}
/* Read the whole file with readchars() and split it at newlines with
   substrings().  Returns the resulting array of line strings and stores
   the number of lines in len_ptr. */
char **
readlines(filename, len_ptr)
     char *filename;
     int *len_ptr;
{
  int nchars;
  char *text, *text_end;

  text = readchars(filename, &nchars);
  text_end = text + nchars;
  return(substrings(text, text_end, '\n', len_ptr));
}
576 | 


--------------------------------------------------------------------------------